diff --git a/.gitignore b/.gitignore index dee574a..3d96541 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ -.idea/ -target/ -bin/ -.classpath -.project -*.iml +.texpadtmp +*.aux +*.log +*.pdf + diff --git a/Bibliography.bib b/Bibliography.bib new file mode 100644 index 0000000..abcf753 --- /dev/null +++ b/Bibliography.bib @@ -0,0 +1,38 @@ +@article{Ukkonen95, + author = {Ukkonen, Esko}, + journal = {Algorithmica}, + keywords = {string\_matching, suffix\_trees}, + number = {3}, + pages = {249--260}, + posted-at = {2008-06-08 04:09:56}, + title = {{On-Line Construction of Suffix Trees}}, + volume = {14}, + year = {1995} +} + +@misc{javolution, + title = {{javolution - The Java\texttrademark{} Solution for Real-Time and Embedded Systems}}, + url = {{http://javolution.org/}} +} + +@misc{guava, + author = {{Google, Inc.}}, + title = {{guava-libraries}}, + url = {{http://code.google.com/p/guava-libraries/}} +} + +@misc{hamcrest, + title = {{Hamcrest - library of matchers for building test expressions}}, + url = {{http://code.google.com/p/hamcrest/}} +} + +@misc{testng, + title = {{TestNG}}, + url = {{http://www.testng.org}} +} + +@misc{caliper, + author = {{Google, Inc.}}, + title = {{Caliper - Google's open-source framework for writing, running and viewing the results of Java Microbenchmarks}}, + url = {{http://code.google.com/p/caliper/}} +} \ No newline at end of file diff --git a/INSTALL b/INSTALL deleted file mode 100644 index d7cc29f..0000000 --- a/INSTALL +++ /dev/null @@ -1,5 +0,0 @@ -To build, please install an up-to-date version of maven and run the following commands. -In the project root directory: mvn clean install -Then, in the assembly directory: mvn clean install assembly:single -This will generate a tar.gz and a .zip containing the diffr jar, and a bash script each for diff and patch. 
- diff --git a/README b/README deleted file mode 100644 index 183d85f..0000000 --- a/README +++ /dev/null @@ -1,2 +0,0 @@ -Intelligent DIFF/PATCH tool that knows copy and move, and has an 'R' at the end of its name. - diff --git a/assembly/pom.xml b/assembly/pom.xml deleted file mode 100644 index 9da8f42..0000000 --- a/assembly/pom.xml +++ /dev/null @@ -1,60 +0,0 @@ - - - 4.0.0 - - - diffr - parent - 1.0-SNAPSHOT - - - diffr - assembly - 1.0-SNAPSHOT - jar - ${project.groupId}.${project.artifactId} - Assembles diffr and patchr into a single jar. - - - - diffr - diff - ${current.version} - - - diffr - patch - ${current.version} - - - - - - - org.apache.maven.plugins - maven-shade-plugin - 1.6 - - - package - - shade - - - - - - org.apache.maven.plugins - maven-assembly-plugin - ${assembly.version} - - - src/main/assembly/assembly.xml - - - - - - diff --git a/assembly/src/main/assembly/assembly.xml b/assembly/src/main/assembly/assembly.xml deleted file mode 100644 index 190f656..0000000 --- a/assembly/src/main/assembly/assembly.xml +++ /dev/null @@ -1,31 +0,0 @@ - - diffr-assembly - - zip - tar.gz - - false - - - true - - diffr:assembly:jar - - - - - - - / - src/main/bin/diffr.sh - 755 - - - / - src/main/bin/patchr.sh - 755 - - - diff --git a/assembly/src/main/bin/diffr.sh b/assembly/src/main/bin/diffr.sh deleted file mode 100644 index 5403782..0000000 --- a/assembly/src/main/bin/diffr.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -java -cp "*" diffr.diff.Main $@ diff --git a/assembly/src/main/bin/patchr.sh b/assembly/src/main/bin/patchr.sh deleted file mode 100644 index f4ef357..0000000 --- a/assembly/src/main/bin/patchr.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -java -cp "*" diffr.patch.Main $@ diff --git a/conclusion/conclusion.tex b/conclusion/conclusion.tex new file mode 100644 index 0000000..1f5fc94 --- /dev/null +++ b/conclusion/conclusion.tex @@ -0,0 +1,7 @@ +\section{Conclusion} + +Overall, the project was a success. 
The team as a whole had the opportunity to use tools that allowed us to streamline our work quite effectively. As some members were more familiar than others with a particular tool or technology, communication was essential. We relied heavily on \texttt{bitbucket.org} to store code and to track and assign issues to each other. A post-meeting email was systematically sent with goals for each member of the team, even when all team members were physically present at the meeting. + +The final sprints were conducted when all team members were physically separated in locations that spanned 6 different timezones (from the East Coast of the USA to Poland). This shows that while distance certainly can hinder efficiency, remote development teams can still function well. Correct and up-to-date commit messages, distributed version control, and frequent communication were key to overcoming glitches and ensuring our success. + +In this project we have learned how \texttt{DIFF} and \texttt{PATCH} tools work, and why they are so important. We have also gained experience in working with clone detection techniques such as suffix trees, which proved a very effective addition to the \texttt{diffr} tool. The tools we produced performed well against the oft-used \texttt{GNU DIFF}; the result was a slightly slower, but well-scaling tool that outputs significantly smaller patch files. As an additional challenge, the group plans to port this tool to \texttt{C/C++} over the summer, in order to improve the runtime. diff --git a/design/design.tex b/design/design.tex new file mode 100644 index 0000000..9e82632 --- /dev/null +++ b/design/design.tex @@ -0,0 +1,45 @@ +\section{UML Diagram} + +Below are the UML diagrams for each module in the system, and one to show the interactions between them. 
+ +\begin{figure}[H] +\begin{center} +\includegraphics[width=\textwidth]{design/diffRUML-util.png} +\end{center} +\caption{UML diagram for the Util module.} +\label{fig:utilUML} +\end{figure} + +\begin{figure}[H] +\begin{center} +\includegraphics[width=\textwidth]{design/diffRUML-suffixtree.png} +\end{center} +\caption{UML diagram for the SuffixTree module.} +\label{fig:suffixTreeUML} +\end{figure} + +\begin{figure}[H] +\begin{center} +\includegraphics[width=\textwidth]{design/diffRUML-diffR.png} +\end{center} +\caption{UML diagram for the DiffR module.} +\label{fig:diffrUML} +\end{figure} + +\begin{figure}[H] +\begin{center} +\includegraphics[width=\textwidth]{design/diffRUML-patchR.png} +\end{center} +\caption{UML diagram for the PatchR module.} +\label{fig:patchrUML} +\end{figure} + +\begin{figure}[H] +\begin{center} +\includegraphics[width=\textwidth]{design/diffRUML-packages.png} +\end{center} +\caption{UML diagram to show module interaction.} +\label{fig:packagesUML} +\end{figure} + + diff --git a/design/diffRUML-diffR.png b/design/diffRUML-diffR.png new file mode 100644 index 0000000..fbe0e52 Binary files /dev/null and b/design/diffRUML-diffR.png differ diff --git a/design/diffRUML-packages.png b/design/diffRUML-packages.png new file mode 100644 index 0000000..4fc9839 Binary files /dev/null and b/design/diffRUML-packages.png differ diff --git a/design/diffRUML-patchR.png b/design/diffRUML-patchR.png new file mode 100644 index 0000000..9b475b3 Binary files /dev/null and b/design/diffRUML-patchR.png differ diff --git a/design/diffRUML-suffixtree.png b/design/diffRUML-suffixtree.png new file mode 100644 index 0000000..cc026cf Binary files /dev/null and b/design/diffRUML-suffixtree.png differ diff --git a/design/diffRUML-util.png b/design/diffRUML-util.png new file mode 100644 index 0000000..1049931 Binary files /dev/null and b/design/diffRUML-util.png differ diff --git a/diff/pom.xml b/diff/pom.xml deleted file mode 100644 index 3bb6e95..0000000 --- 
a/diff/pom.xml +++ /dev/null @@ -1,56 +0,0 @@ - - - 4.0.0 - - - diffr - parent - 1.0-SNAPSHOT - - - diffr - diff - 1.0-SNAPSHOT - jar - ${project.groupId}.${project.artifactId} - Diff for diffr. - - - - diffr - suffix-tree - ${current.version} - - - diffr - util - ${current.version} - - - com.google.guava - guava - ${guava.version} - - - diffr - util - ${current.version} - test - tests - - - org.testng - testng - ${testng.version} - test - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - test - - - diff --git a/diff/src/main/java/diffr/diff/Diffr.java b/diff/src/main/java/diffr/diff/Diffr.java deleted file mode 100644 index 9600711..0000000 --- a/diff/src/main/java/diffr/diff/Diffr.java +++ /dev/null @@ -1,71 +0,0 @@ -package diffr.diff; - -import com.google.common.collect.Lists; -import diffr.suffixtree.SuffixTree; -import diffr.suffixtree.SuffixTree.Matcher; -import diffr.suffixtree.SuffixTrees; -import diffr.util.instruction.CopyInstruction; -import diffr.util.instruction.InsertInstruction; -import diffr.util.instruction.Instruction; - -import java.util.List; - -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * Generates a list of {@link Instruction}s to transform original file into a new file. - * - * @author Sarina Gurung - * @author Jakub D Kozlowski - * @since 0.3 - */ -public final class Diffr { - - private final List originalFile; - - private final List newFile; - - /** - * Default constructor. - * - * @param originalFile original file to be transform. - * @param newFile new file to transform {@code originalFile} to. - * - * @throws NullPointerException if any parameter is null. - */ - public Diffr(final List originalFile, final List newFile) { - this.originalFile = checkNotNull(originalFile); - this.newFile = checkNotNull(newFile); - } - - /** - * Gets the list of {@link Instruction}s to transform {@code originalFile} to {@code newFile}. - * - * @return list of {@link Instruction}s. 
- */ - public List diff() { - - final List instructions = Lists.newArrayList(); - final SuffixTree suffixTree = SuffixTrees.newSuffixTree(this.originalFile); - - Matcher matcher = suffixTree.matcher(); - - for (final String newFileLine : newFile) { - - if (!matcher.matchNext(newFileLine).isMatched()) { - if (!matcher.isRoot()) { - instructions.add(new CopyInstruction(matcher.range())); - } - instructions.add(new InsertInstruction(newFileLine)); - matcher = suffixTree.matcher(); - } - } - - if (!matcher.isRoot()) { - instructions.add(new CopyInstruction(matcher.range())); - } - - return instructions; - } - -} diff --git a/diff/src/main/java/diffr/diff/Main.java b/diff/src/main/java/diffr/diff/Main.java deleted file mode 100644 index 2555a76..0000000 --- a/diff/src/main/java/diffr/diff/Main.java +++ /dev/null @@ -1,112 +0,0 @@ -package diffr.diff; - -import com.google.common.base.Optional; -import com.google.common.io.Files; -import diffr.util.ArgumentsProcessor; -import diffr.util.instruction.Instruction; -import diffr.util.instruction.InstructionComposer; -import diffr.util.instruction.Instructions; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.List; - -/** - * Main entry point to diffr's DIFF tool. - *

- *

- * Expects two arguments: - *

    - *
  • <original-file> - The original file to diff.
  • - *
  • <new-file> - The new version of the original file to diff.
  • - *
- *

- * - * @author Jakub D Kozlowski - * @author Sarina Gurung - * @since 0.1 - */ -public final class Main { - - /** - * Prints the usage of this tool. - */ - private static void printUsage() { - System.out.println("Usage: \n" + - " diffr \n" + - " diffr -o "); - } - - /** - * Runs the diff tool on two files. - * - * @param args arguments to this tool. - * - * @return exit code. - */ - public static int run(String... args) { - try { - if (ArgumentsProcessor.containsHelpArgument(args) - || (2 != args.length - && 4 != args.length)) { - printUsage(); - return -1; - } - - final File firstFile = new File(args[0]); - final File secondFile = new File(args[1]); - - if (!firstFile.exists()) { - System.err.println("File " + firstFile + " not found."); - return -1; - } - - if (!secondFile.exists()) { - System.err.println("File " + secondFile + " not found."); - return -1; - } - - final List originalFile = Files.readLines(firstFile, Charset.defaultCharset()); - final List newFile = Files.readLines(secondFile, Charset.defaultCharset()); - - final List instructions = new Diffr(originalFile, newFile).diff(); - - final Optional outputFile = ArgumentsProcessor.extractOutputFile(args); - - if (4 == args.length - && outputFile.isPresent()) { - - final File file = new File(outputFile.get()); - final BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(file)); - for (final Instruction instruction : instructions) { - Instructions.writeInstruction(instruction, bufferedWriter); - } - bufferedWriter.close(); - } - else { - for (final Instruction instruction : instructions) { - System.out.println(InstructionComposer.composeString(instruction)); - } - System.out.flush(); - } - - return 0; - } - catch (final IOException io) { - System.err.println("There was a problem reading the files: " + io); - return -1; - } - } - - /** - * Invokes {@link #run(String...)} and calls {@link System#exit(int)}. - * - * @param args arguments to this tool. - */ - public static void main(String... 
args) { - System.exit(run(args)); - } -} diff --git a/diff/src/test/java/diffr/diff/DiffrTest.java b/diff/src/test/java/diffr/diff/DiffrTest.java deleted file mode 100644 index 356fb0c..0000000 --- a/diff/src/test/java/diffr/diff/DiffrTest.java +++ /dev/null @@ -1,95 +0,0 @@ -package diffr.diff; - -import com.google.common.collect.Lists; -import com.google.common.io.Files; -import com.google.common.io.LineProcessor; -import com.google.common.io.Resources; -import diffr.util.instruction.Instruction; -import diffr.util.instruction.InstructionComposer; -import diffr.util.instruction.InstructionParser; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.charset.Charset; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link Diffr}. 
- * - * @author Sarina Gurung - * @author Jakub D Kozlowski - * @since 0.3 - */ -public class DiffrTest { - - private static final String DEFAULT_PROVIDER = "default-provider"; - - @DataProvider(name = "default-provider") - public Object[][] getFiles() throws URISyntaxException, IOException { - - final List files = Lists.newArrayList(); - - final File originalDir = new File(Resources.getResource("original").toURI()); - for (final File originalFile : originalDir.listFiles()) { - final File newFile = new File(Resources.getResource("new/" + originalFile.getName()).toURI()); - final File patchFile = new File(Resources.getResource("patch/" + originalFile.getName()).toURI()); - files.add(new Object[]{ - Files.readLines(originalFile, Charset.defaultCharset()), - Files.readLines(newFile, Charset.defaultCharset()), - Files.readLines(patchFile, Charset.defaultCharset(), new LineProcessor>() { - - private List instructions = Lists.newArrayList(); - - @Override - public boolean processLine(String s) throws IOException { - instructions.add(InstructionParser.parseInstruction(s).get()); - return true; - } - - @Override - public List getResult() { - return instructions; - } - }) - }); - } - - return files.toArray(new Object[][]{}); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testConstructorNullOriginalFile() { - new Diffr(null, Collections.EMPTY_LIST); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testConstructorNullNewFile() { - new Diffr(Collections.EMPTY_LIST, null); - } - - @Test(dataProvider = DEFAULT_PROVIDER) - public void testDiff(final List originalFile, - final List newFile, - final List patchFile) - throws IOException, URISyntaxException { - - - final Diffr d = new Diffr(originalFile, newFile); - - final Iterator actualInstructions = d.diff().iterator(); - - for (final Instruction expected : patchFile) { - final Instruction actual = actualInstructions.next(); - 
assertThat(InstructionComposer.composeString(actual), - is(InstructionComposer.composeString(expected))); - } - } -} diff --git a/diff/src/test/resources/new/test1 b/diff/src/test/resources/new/test1 deleted file mode 100644 index 20e4817..0000000 --- a/diff/src/test/resources/new/test1 +++ /dev/null @@ -1,5 +0,0 @@ -One -Two -Three -Five -Seven \ No newline at end of file diff --git a/diff/src/test/resources/new/test2 b/diff/src/test/resources/new/test2 deleted file mode 100644 index 2ab6864..0000000 --- a/diff/src/test/resources/new/test2 +++ /dev/null @@ -1,7 +0,0 @@ -One -Two -Three -Five -Seven -Eight -Nine \ No newline at end of file diff --git a/diff/src/test/resources/new/test3 b/diff/src/test/resources/new/test3 deleted file mode 100644 index c86d140..0000000 --- a/diff/src/test/resources/new/test3 +++ /dev/null @@ -1,8 +0,0 @@ -One -Two -Three -Five -Seven -Eight -Nine -Ten \ No newline at end of file diff --git a/diff/src/test/resources/new/test4 b/diff/src/test/resources/new/test4 deleted file mode 100644 index 2b10312..0000000 --- a/diff/src/test/resources/new/test4 +++ /dev/null @@ -1,10 +0,0 @@ -One -Two -Three -Five -Seven -Eight -Nine -Ten -Twelve -Thirteen \ No newline at end of file diff --git a/diff/src/test/resources/new/test5 b/diff/src/test/resources/new/test5 deleted file mode 100644 index 582b92b..0000000 --- a/diff/src/test/resources/new/test5 +++ /dev/null @@ -1,11 +0,0 @@ -One -Two -Three -Five -Seven -Eight -Nine -Ten -Twelve -Thirteen -Fifteen \ No newline at end of file diff --git a/diff/src/test/resources/new/test6 b/diff/src/test/resources/new/test6 deleted file mode 100644 index 40816a2..0000000 --- a/diff/src/test/resources/new/test6 +++ /dev/null @@ -1 +0,0 @@ -Hi \ No newline at end of file diff --git a/diff/src/test/resources/new/test7 b/diff/src/test/resources/new/test7 deleted file mode 100644 index 40816a2..0000000 --- a/diff/src/test/resources/new/test7 +++ /dev/null @@ -1 +0,0 @@ -Hi \ No newline at end of file diff --git 
a/diff/src/test/resources/new/test9 b/diff/src/test/resources/new/test9 deleted file mode 100644 index c7c85c9..0000000 --- a/diff/src/test/resources/new/test9 +++ /dev/null @@ -1,7 +0,0 @@ -January -February -March - -April -May -June \ No newline at end of file diff --git a/diff/src/test/resources/original/test1 b/diff/src/test/resources/original/test1 deleted file mode 100644 index 2402b58..0000000 --- a/diff/src/test/resources/original/test1 +++ /dev/null @@ -1,5 +0,0 @@ -One -Two -Three -Four -Six \ No newline at end of file diff --git a/diff/src/test/resources/original/test2 b/diff/src/test/resources/original/test2 deleted file mode 100644 index ba63efe..0000000 --- a/diff/src/test/resources/original/test2 +++ /dev/null @@ -1,7 +0,0 @@ -One -Two -Three -Four -Six -Eight -Nine \ No newline at end of file diff --git a/diff/src/test/resources/original/test3 b/diff/src/test/resources/original/test3 deleted file mode 100644 index 945d8c5..0000000 --- a/diff/src/test/resources/original/test3 +++ /dev/null @@ -1,8 +0,0 @@ -Some other line -Two -Three -Four -Six -Eight -Nine -Elevan diff --git a/diff/src/test/resources/original/test4 b/diff/src/test/resources/original/test4 deleted file mode 100644 index f790a2e..0000000 --- a/diff/src/test/resources/original/test4 +++ /dev/null @@ -1,10 +0,0 @@ -One -Two -Three -Four -Six -Eight -Nine -Elevan -Twelve -Thirteen \ No newline at end of file diff --git a/diff/src/test/resources/original/test5 b/diff/src/test/resources/original/test5 deleted file mode 100644 index abfb0ce..0000000 --- a/diff/src/test/resources/original/test5 +++ /dev/null @@ -1,11 +0,0 @@ -One -Two -Three -Four -Six -Eight -Nine -Elevan -Twelve -Thirteen -Fourteen \ No newline at end of file diff --git a/diff/src/test/resources/original/test6 b/diff/src/test/resources/original/test6 deleted file mode 100644 index 5ab2f8a..0000000 --- a/diff/src/test/resources/original/test6 +++ /dev/null @@ -1 +0,0 @@ -Hello \ No newline at end of file diff --git 
a/diff/src/test/resources/original/test7 b/diff/src/test/resources/original/test7 deleted file mode 100644 index 40816a2..0000000 --- a/diff/src/test/resources/original/test7 +++ /dev/null @@ -1 +0,0 @@ -Hi \ No newline at end of file diff --git a/diff/src/test/resources/original/test9 b/diff/src/test/resources/original/test9 deleted file mode 100644 index 6d45f45..0000000 --- a/diff/src/test/resources/original/test9 +++ /dev/null @@ -1,7 +0,0 @@ -April -May -June - -April -May -June \ No newline at end of file diff --git a/diff/src/test/resources/patch/test1 b/diff/src/test/resources/patch/test1 deleted file mode 100644 index 8ef175c..0000000 --- a/diff/src/test/resources/patch/test1 +++ /dev/null @@ -1,3 +0,0 @@ -0,2 -> Five -> Seven diff --git a/diff/src/test/resources/patch/test2 b/diff/src/test/resources/patch/test2 deleted file mode 100644 index 237a1c9..0000000 --- a/diff/src/test/resources/patch/test2 +++ /dev/null @@ -1,4 +0,0 @@ -0,2 -> Five -> Seven -5,6 \ No newline at end of file diff --git a/diff/src/test/resources/patch/test3 b/diff/src/test/resources/patch/test3 deleted file mode 100644 index f5d419f..0000000 --- a/diff/src/test/resources/patch/test3 +++ /dev/null @@ -1,6 +0,0 @@ -> One -1,2 -> Five -> Seven -5,6 -> Ten diff --git a/diff/src/test/resources/patch/test4 b/diff/src/test/resources/patch/test4 deleted file mode 100644 index cac32de..0000000 --- a/diff/src/test/resources/patch/test4 +++ /dev/null @@ -1,6 +0,0 @@ -0,2 -> Five -> Seven -5,6 -> Ten -8,9 \ No newline at end of file diff --git a/diff/src/test/resources/patch/test5 b/diff/src/test/resources/patch/test5 deleted file mode 100644 index ff337de..0000000 --- a/diff/src/test/resources/patch/test5 +++ /dev/null @@ -1,7 +0,0 @@ -0,2 -> Five -> Seven -5,6 -> Ten -8,9 -> Fifteen \ No newline at end of file diff --git a/diff/src/test/resources/patch/test6 b/diff/src/test/resources/patch/test6 deleted file mode 100644 index c7eb67a..0000000 --- a/diff/src/test/resources/patch/test6 +++ 
/dev/null @@ -1 +0,0 @@ -> Hi \ No newline at end of file diff --git a/diff/src/test/resources/patch/test7 b/diff/src/test/resources/patch/test7 deleted file mode 100644 index 7de346d..0000000 --- a/diff/src/test/resources/patch/test7 +++ /dev/null @@ -1 +0,0 @@ -0,0 \ No newline at end of file diff --git a/diff/src/test/resources/patch/test9 b/diff/src/test/resources/patch/test9 deleted file mode 100644 index 71432c4..0000000 --- a/diff/src/test/resources/patch/test9 +++ /dev/null @@ -1,4 +0,0 @@ -> January -> February -> March -3,6 \ No newline at end of file diff --git a/implementation/implementation.tex b/implementation/implementation.tex new file mode 100644 index 0000000..31305f0 --- /dev/null +++ b/implementation/implementation.tex @@ -0,0 +1,47 @@ +\section{Implementation} + +In this chapter we will describe the overall architecture of \texttt{diffr} and \texttt{patchr} and provide a few implementation details. Then we will move on to describe the tools we used during the implementation. + +\subsection{Modules} + +\paragraph{diffr.suffix-tree (suffix-tree/)} +This module contains the Suffix Tree implementation. It is a generic Suffix Tree based on the implementation suggested in \cite{Ukkonen95} and optimised for quickly matching suffixes of elements. The implementation details are completely hidden from the user behind the \\ \texttt{diffr.suffixtree.SuffixTree} interface and \\ \texttt{diffr.suffixtree.SuffixTrees} factory. Using the \texttt{SuffixTree} for matching sequences of elements can be accomplished through an implementation of \texttt{diffr.suffixtree.SuffixTree.Matcher} interface returned from \\ \texttt{SuffixTree\#matcher()} method. Internally the Suffix Tree uses high-performance, real-time \texttt{java.util.List} and \texttt{java.util.Map} implementations from the \texttt{javolution library} (\texttt{javolution.util.FastTable} and \texttt{javolution.util.FastMap})~\cite{javolution}. 
+ +The hashcode of each line of the first document is computed and used first, before a deep comparison of lines, when attempting to find clones in the tree. +This improves performance slightly. + +\paragraph{diffr.util (util/)} +This module contains various domain objects and utility classes. The main classes encapsulate the two possible instructions output by \texttt{diffr} and are located in the \texttt{diffr.util.instruction} package. Also in this package, we have implemented classes that deal with transforming instructions to/from text and writing them to streams. + +\paragraph{diffr.patch (patch/)} +This module contains the patch implementation. The algorithm is implemented in \texttt{diffr.patch.Patchr}: it reads both the original file and the patch file into memory. It then uses the classes from the \texttt{util/} module to parse and validate the patch file for the existence of incorrect instructions and terminates with an error message if the validation fails. It then iterates through instructions and transforms them into appropriate text. The instruction text is collected in a list and returned. It is the \texttt{diffr.patch.Main} class that outputs the transformed file. This separation of concerns allows us to choose between writing to file and standard output, depending on a flag specified by the user. + +Additionally, the patch tool will exit and print a relevant error message if one of the files cannot be read, or if any sort of exception is thrown. + +\paragraph{diffr.diff (diff/)} + +This module contains the diff implementation. The algorithm is implemented in \texttt{diffr.diff.Diffr}: it builds the \texttt{SuffixTree} of the original file and then iterates through the new file in order to collect in a list the longest sequences of clones and holes between the two files, and returns this list. Similarly to \texttt{patch/}, it is the \texttt{diffr.diff.Main} class that outputs the instructions. 
Again, this separation of concerns allows us to choose between writing to file and standard output, depending on a flag specified by the user. + +Also, relevant error messages are printed if any error condition occurs. + +\paragraph{diffr.assembly (assembly/)} +This module builds a jar file with all the \texttt{diffr.patch}, \texttt{diffr.diff} classes and their dependencies. The jar file is then aggregated together with bash scripts for running \texttt{diff} and \texttt{patch} in \texttt{.zip} and \texttt{.tar.gz} archives. + +\paragraph{diffr.integration-tests (integration-tests/)} +This module contains the integration tests, further described in~\Cref{IntegrationTesting}. + +\subsection{Tools} + +\paragraph{Build Management} + +We used \textit{Maven3} as our build tool. The main advantage of \textit{Maven3} over the more traditional \textit{Ant} is automatic dependency management and default build configuration that suits most projects well. + +\paragraph{Version Control} +We decided to use \textit{git} as our version control system, as most of our group were already familiar with it. \textit{git} is great for doing distributed, offline development and the first-class support for branching means we can all safely work in separate branches and freely share code, without polluting the history in the main branch. We also decided to use \textit{bitbucket.org} to host our repository due to the built-in support for issues and pull requests, which we used extensively for planning iterations, tracking tasks and code review. + +\paragraph{IDE} +Because we used \textit{Maven3} as our build tool, our team members were free to choose any IDE they wished. Our team members used \textit{IntelliJ IDEA} and \textit{Eclipse}. + +\paragraph{Libraries} +We mainly used two open source libraries in the production code: \texttt{javolution}~\cite{javolution} and \texttt{guava-libraries}~\cite{guava}. 
The already mentioned~\cite{javolution} provided high-performance, real-time replacements for \texttt{Java Collections} classes and~\cite{guava} useful utilities for idiomatically reading/writing files, validating input etc. + diff --git a/integration-tests/pom.xml b/integration-tests/pom.xml deleted file mode 100644 index e27b375..0000000 --- a/integration-tests/pom.xml +++ /dev/null @@ -1,53 +0,0 @@ - - - 4.0.0 - - - diffr - parent - 1.0-SNAPSHOT - - - diffr - integration-tests - 1.0-SNAPSHOT - jar - ${project.groupId}.${project.artifactId} - Integration tests for patchr and diffr. - - - - diffr - util - ${current.version} - test - tests - - - diffr - diff - ${current.version} - test - - - diffr - patch - ${current.version} - test - - - org.testng - testng - ${testng.version} - test - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - test - - - diff --git a/integration-tests/src/test/java/diffr/integration/DiffPatchIntegrationTest.java b/integration-tests/src/test/java/diffr/integration/DiffPatchIntegrationTest.java deleted file mode 100644 index 539275f..0000000 --- a/integration-tests/src/test/java/diffr/integration/DiffPatchIntegrationTest.java +++ /dev/null @@ -1,91 +0,0 @@ -package diffr.integration; - -import com.google.common.io.Files; -import com.google.common.io.Resources; -import diffr.patch.IllegalPatchFileException; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.net.URISyntaxException; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests diff/patch integration. 
- * - * @author Jakub D Kozlowski - * @since 1.0 - */ -public class DiffPatchIntegrationTest { - - @Test - public void testKernel01ToKernel26() throws IllegalPatchFileException, URISyntaxException, IOException { - testDiffrPatchr("kernel01.txt", "kernel26.txt"); - } - - @Test - public void testKernel26ToKernel01() throws IllegalPatchFileException, URISyntaxException, IOException { - testDiffrPatchr("kernel26.txt", "kernel01.txt"); - } - - @Test - public void testKernel01ToKernel33() throws IllegalPatchFileException, URISyntaxException, IOException { - testDiffrPatchr("kernel01.txt", "kernel33.txt"); - } - - @Test - public void testKernel33ToKernel01() throws IllegalPatchFileException, URISyntaxException, IOException { - testDiffrPatchr("kernel33.txt", "kernel01.txt"); - } - - @Test - public void testKernel26ToKernel33() throws IllegalPatchFileException, URISyntaxException, IOException { - testDiffrPatchr("kernel26.txt", "kernel33.txt"); - } - - @Test - public void testKernel33ToKernel26() throws IllegalPatchFileException, URISyntaxException, IOException { - testDiffrPatchr("kernel33.txt", "kernel26.txt"); - } - - /** - * Runs diffr on {@code originalFileName} and {@code newFileName}, runs patchr on the resulting patch file and - * {@code originalFileName} and compares the result to {@code newFileName}. - * - * @param originalFileName file name of the original file. - * @param newFileName file name of the new file. - * - * @throws IOException if there is a problem reading or writing the files. - * @throws URISyntaxException if the file names cannot be found. 
- */ - public static void testDiffrPatchr(final String originalFileName, final String newFileName) - throws IOException, URISyntaxException { - - final File originalFile = getFile(originalFileName); - final File newFile = getFile(newFileName); - - final File tmpPatchFile = File.createTempFile("diffr", "patch", Files.createTempDir()); - final File tmpNewFile = File.createTempFile("diffr", "new", Files.createTempDir()); - - diffr.diff.Main.run(originalFile.getAbsolutePath(), newFile.getAbsolutePath(), "-o", - tmpPatchFile.getAbsolutePath()); - - diffr.patch.Main.run(originalFile.getAbsolutePath(), tmpPatchFile.getAbsolutePath(), "-o", - tmpNewFile.getAbsolutePath()); - - assertThat(Files.equal(newFile, tmpNewFile), is(true)); - } - - /** - * Gets the {@code fileName} from the classloader. - * - * @param fileName name of file to get. - * - * @return {@code fileName} from the classloader. - */ - public static File getFile(final String fileName) throws URISyntaxException, IOException { - return new File(Resources.getResource(fileName).toURI()); - } -} diff --git a/integration-tests/src/test/resources/kernel01.txt b/integration-tests/src/test/resources/kernel01.txt deleted file mode 100644 index 26d2bda..0000000 --- a/integration-tests/src/test/resources/kernel01.txt +++ /dev/null @@ -1,2585 +0,0 @@ -/* - * console.c - * - * This module implements the console io functions - * 'void con_init(void)' - * 'void con_write(struct tty_queue * queue)' - * Hopefully this will be a rather complete VT102 implementation. - * - */ - -/* - * NOTE!!! We sometimes disable and enable interrupts for a short while - * (to put a word in video IO), but this will work even for keyboard - * interrupts. We know interrupts aren't enabled when getting a keyboard - * interrupt, as we use trap-gates. Hopefully all is well. 
- */ - -#include -#include -#include -#include - -#define SCREEN_START 0xb8000 -#define SCREEN_END 0xc0000 -#define LINES 25 -#define COLUMNS 80 -#define NPAR 16 - -extern void keyboard_interrupt(void); - -static unsigned long origin=SCREEN_START; -static unsigned long scr_end=SCREEN_START+LINES*COLUMNS*2; -static unsigned long pos; -static unsigned long x,y; -static unsigned long top=0,bottom=LINES; -static unsigned long lines=LINES,columns=COLUMNS; -static unsigned long state=0; -static unsigned long npar,par[NPAR]; -static unsigned long ques=0; -static unsigned char attr=0x07; - -/* - * this is what the terminal answers to a ESC-Z or csi0c - * query (= vt100 response). - */ -#define RESPONSE "\033[?1;2c" - -static inline void gotoxy(unsigned int new_x,unsigned int new_y) -{ - if (new_x>=columns || new_y>=lines) - return; - x=new_x; - y=new_y; - pos=origin+((y*columns+x)<<1); -} - -static inline void set_origin(void) -{ - cli(); - outb_p(12,0x3d4); - outb_p(0xff&((origin-SCREEN_START)>>9),0x3d5); - outb_p(13,0x3d4); - outb_p(0xff&((origin-SCREEN_START)>>1),0x3d5); - sti(); -} - -static void scrup(void) -{ - if (!top && bottom==lines) { - origin += columns<<1; - pos += columns<<1; - scr_end += columns<<1; - if (scr_end>SCREEN_END) { - __asm__("cld\n\t" - "rep\n\t" - "movsl\n\t" - "movl _columns,%1\n\t" - "rep\n\t" - "stosw" - ::"a" (0x0720), - "c" ((lines-1)*columns>>1), - "D" (SCREEN_START), - "S" (origin) - :"cx","di","si"); - scr_end -= origin-SCREEN_START; - pos -= origin-SCREEN_START; - origin = SCREEN_START; - } else { - __asm__("cld\n\t" - "rep\n\t" - "stosl" - ::"a" (0x07200720), - "c" (columns>>1), - "D" (scr_end-(columns<<1)) - :"cx","di"); - } - set_origin(); - } else { - __asm__("cld\n\t" - "rep\n\t" - "movsl\n\t" - "movl _columns,%%ecx\n\t" - "rep\n\t" - "stosw" - ::"a" (0x0720), - "c" ((bottom-top-1)*columns>>1), - "D" (origin+(columns<<1)*top), - "S" (origin+(columns<<1)*(top+1)) - :"cx","di","si"); - } -} - -static void scrdown(void) -{ - 
__asm__("std\n\t" - "rep\n\t" - "movsl\n\t" - "addl $2,%%edi\n\t" /* %edi has been decremented by 4 */ - "movl _columns,%%ecx\n\t" - "rep\n\t" - "stosw" - ::"a" (0x0720), - "c" ((bottom-top-1)*columns>>1), - "D" (origin+(columns<<1)*bottom-4), - "S" (origin+(columns<<1)*(bottom-1)-4) - :"ax","cx","di","si"); -} - -static void lf(void) -{ - if (y+1top) { - y--; - pos -= columns<<1; - return; - } - scrdown(); -} - -static void cr(void) -{ - pos -= x<<1; - x=0; -} - -static void del(void) -{ - if (x) { - pos -= 2; - x--; - *(unsigned short *)pos = 0x0720; - } -} - -static void csi_J(int par) -{ - long count __asm__("cx"); - long start __asm__("di"); - - switch (par) { - case 0: /* erase from cursor to end of display */ - count = (scr_end-pos)>>1; - start = pos; - break; - case 1: /* erase from start to cursor */ - count = (pos-origin)>>1; - start = origin; - break; - case 2: /* erase whole display */ - count = columns*lines; - start = origin; - break; - default: - return; - } - __asm__("cld\n\t" - "rep\n\t" - "stosw\n\t" - ::"c" (count), - "D" (start),"a" (0x0720) - :"cx","di"); -} - -static void csi_K(int par) -{ - long count __asm__("cx"); - long start __asm__("di"); - - switch (par) { - case 0: /* erase from cursor to end of line */ - if (x>=columns) - return; - count = columns-x; - start = pos; - break; - case 1: /* erase from start of line to cursor */ - start = pos - (x<<1); - count = (x>9),0x3d5); - outb_p(15,0x3d4); - outb_p(0xff&((pos-SCREEN_START)>>1),0x3d5); - sti(); -} - -static void respond(struct tty_struct * tty) -{ - char * p = RESPONSE; - - cli(); - while (*p) { - PUTCH(*p,tty->read_q); - p++; - } - sti(); - copy_to_cooked(tty); -} - -static void insert_char(void) -{ - int i=x; - unsigned short tmp,old=0x0720; - unsigned short * p = (unsigned short *) pos; - - while (i++=columns) - return; - i = x; - while (++i < columns) { - *p = *(p+1); - p++; - } - *p=0x0720; -} - -static void delete_line(void) -{ - int oldtop,oldbottom; - - oldtop=top; - 
oldbottom=bottom; - top=y; - bottom=lines; - scrup(); - top=oldtop; - bottom=oldbottom; -} - -static void csi_at(int nr) -{ - if (nr>columns) - nr=columns; - else if (!nr) - nr=1; - while (nr--) - insert_char(); -} - -static void csi_L(int nr) -{ - if (nr>lines) - nr=lines; - else if (!nr) - nr=1; - while (nr--) - insert_line(); -} - -static void csi_P(int nr) -{ - if (nr>columns) - nr=columns; - else if (!nr) - nr=1; - while (nr--) - delete_char(); -} - -static void csi_M(int nr) -{ - if (nr>lines) - nr=lines; - else if (!nr) - nr=1; - while (nr--) - delete_line(); -} - -static int saved_x=0; -static int saved_y=0; - -static void save_cur(void) -{ - saved_x=x; - saved_y=y; -} - -static void restore_cur(void) -{ - x=saved_x; - y=saved_y; - pos=origin+((y*columns+x)<<1); -} - -void con_write(struct tty_struct * tty) -{ - int nr; - char c; - - nr = CHARS(tty->write_q); - while (nr--) { - GETCH(tty->write_q,c); - switch(state) { - case 0: - if (c>31 && c<127) { - if (x>=columns) { - x -= columns; - pos -= columns<<1; - lf(); - } - __asm__("movb _attr,%%ah\n\t" - "movw %%ax,%1\n\t" - ::"a" (c),"m" (*(short *)pos) - :"ax"); - pos += 2; - x++; - } else if (c==27) - state=1; - else if (c==10 || c==11 || c==12) - lf(); - else if (c==13) - cr(); - else if (c==ERASE_CHAR(tty)) - del(); - else if (c==8) { - if (x) { - x--; - pos -= 2; - } - } else if (c==9) { - c=8-(x&7); - x += c; - pos += c<<1; - if (x>columns) { - x -= columns; - pos -= columns<<1; - lf(); - } - c=9; - } - break; - case 1: - state=0; - if (c=='[') - state=2; - else if (c=='E') - gotoxy(0,y+1); - else if (c=='M') - ri(); - else if (c=='D') - lf(); - else if (c=='Z') - respond(tty); - else if (x=='7') - save_cur(); - else if (x=='8') - restore_cur(); - break; - case 2: - for(npar=0;npar='0' && c<='9') { - par[npar]=10*par[npar]+c-'0'; - break; - } else state=4; - case 4: - state=0; - switch(c) { - case 'G': case '`': - if (par[0]) par[0]--; - gotoxy(par[0],y); - break; - case 'A': - if (!par[0]) par[0]++; - 
gotoxy(x,y-par[0]); - break; - case 'B': case 'e': - if (!par[0]) par[0]++; - gotoxy(x,y+par[0]); - break; - case 'C': case 'a': - if (!par[0]) par[0]++; - gotoxy(x+par[0],y); - break; - case 'D': - if (!par[0]) par[0]++; - gotoxy(x-par[0],y); - break; - case 'E': - if (!par[0]) par[0]++; - gotoxy(0,y+par[0]); - break; - case 'F': - if (!par[0]) par[0]++; - gotoxy(0,y-par[0]); - break; - case 'd': - if (par[0]) par[0]--; - gotoxy(x,par[0]); - break; - case 'H': case 'f': - if (par[0]) par[0]--; - if (par[1]) par[1]--; - gotoxy(par[1],par[0]); - break; - case 'J': - csi_J(par[0]); - break; - case 'K': - csi_K(par[0]); - break; - case 'L': - csi_L(par[0]); - break; - case 'M': - csi_M(par[0]); - break; - case 'P': - csi_P(par[0]); - break; - case '@': - csi_at(par[0]); - break; - case 'm': - csi_m(); - break; - case 'r': - if (par[0]) par[0]--; - if (!par[1]) par[1]=lines; - if (par[0] < par[1] && - par[1] <= lines) { - top=par[0]; - bottom=par[1]; - } - break; - case 's': - save_cur(); - break; - case 'u': - restore_cur(); - break; - } - } - } - set_cursor(); -} - -/* - * void con_init(void); - * - * This routine initalizes console interrupts, and does nothing - * else. If you want the screen to clear, call tty_write with - * the appropriate escape-sequece. 
- */ -void con_init(void) -{ - register unsigned char a; - - gotoxy(*(unsigned char *)(0x90000+510),*(unsigned char *)(0x90000+511)); - set_trap_gate(0x21,&keyboard_interrupt); - outb_p(inb_p(0x21)&0xfd,0x21); - a=inb_p(0x61); - outb_p(a|0x80,0x61); - outb(a,0x61); -} -#include -#include -#include - -#include -#include -#include -#include - -int sys_pause(void); -int sys_close(int fd); - -void release(struct task_struct * p) -{ - int i; - - if (!p) - return; - for (i=1 ; i32) - return; - if (priv || - current->uid==p->uid || - current->euid==p->uid || - current->uid==p->euid || - current->euid==p->euid) - p->signal |= (1<<(sig-1)); -} - -void do_kill(long pid,long sig,int priv) -{ - struct task_struct **p = NR_TASKS + task; - - if (!pid) while (--p > &FIRST_TASK) { - if (*p && (*p)->pgrp == current->pid) - send_sig(sig,*p,priv); - } else if (pid>0) while (--p > &FIRST_TASK) { - if (*p && (*p)->pid == pid) - send_sig(sig,*p,priv); - } else if (pid == -1) while (--p > &FIRST_TASK) - send_sig(sig,*p,priv); - else while (--p > &FIRST_TASK) - if (*p && (*p)->pgrp == -pid) - send_sig(sig,*p,priv); -} - -int sys_kill(int pid,int sig) -{ - do_kill(pid,sig,!(current->uid || current->euid)); - return 0; -} - -int do_exit(long code) -{ - int i; - - free_page_tables(get_base(current->ldt[1]),get_limit(0x0f)); - free_page_tables(get_base(current->ldt[2]),get_limit(0x17)); - for (i=0 ; ifather == current->pid) - task[i]->father = 0; - for (i=0 ; ifilp[i]) - sys_close(i); - iput(current->pwd); - current->pwd=NULL; - iput(current->root); - current->root=NULL; - if (current->leader && current->tty >= 0) - tty_table[current->tty].pgrp = 0; - if (last_task_used_math == current) - last_task_used_math = NULL; - if (current->father) { - current->state = TASK_ZOMBIE; - do_kill(current->father,SIGCHLD,1); - current->exit_code = code; - } else - release(current); - schedule(); - return (-1); /* just to suppress warnings */ -} - -int sys_exit(int error_code) -{ - return 
do_exit((error_code&0xff)<<8); -} - -int sys_waitpid(pid_t pid,int * stat_addr, int options) -{ - int flag=0; - struct task_struct ** p; - - verify_area(stat_addr,4); -repeat: - for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) - if (*p && *p != current && - (pid==-1 || (*p)->pid==pid || - (pid==0 && (*p)->pgrp==current->pgrp) || - (pid<0 && (*p)->pgrp==-pid))) - if ((*p)->father == current->pid) { - flag=1; - if ((*p)->state==TASK_ZOMBIE) { - put_fs_long((*p)->exit_code, - (unsigned long *) stat_addr); - current->cutime += (*p)->utime; - current->cstime += (*p)->stime; - flag = (*p)->pid; - release(*p); - return flag; - } - } - if (flag) { - if (options & WNOHANG) - return 0; - sys_pause(); - if (!(current->signal &= ~(1<<(SIGCHLD-1)))) - goto repeat; - else - return -EINTR; - } - return -ECHILD; -} - - -/* - * 'fork.c' contains the help-routines for the 'fork' system call - * (see also system_call.s), and some misc functions ('verify_area'). - * Fork is rather simple, once you get the hang of it, but the memory - * management can be a bitch. 
See 'mm/mm.c': 'copy_page_tables()' - */ -#include - -#include -#include -#include -#include - -extern void write_verify(unsigned long address); - -long last_pid=0; - -void verify_area(void * addr,int size) -{ - unsigned long start; - - start = (unsigned long) addr; - size += start & 0xfff; - start &= 0xfffff000; - start += get_base(current->ldt[2]); - while (size>0) { - size -= 4096; - write_verify(start); - start += 4096; - } -} - -int copy_mem(int nr,struct task_struct * p) -{ - unsigned long old_data_base,new_data_base,data_limit; - unsigned long old_code_base,new_code_base,code_limit; - - code_limit=get_limit(0x0f); - data_limit=get_limit(0x17); - old_code_base = get_base(current->ldt[1]); - old_data_base = get_base(current->ldt[2]); - if (old_data_base != old_code_base) - panic("We don't support separate I&D"); - if (data_limit < code_limit) - panic("Bad data_limit"); - new_data_base = new_code_base = nr * 0x4000000; - set_base(p->ldt[1],new_code_base); - set_base(p->ldt[2],new_data_base); - if (copy_page_tables(old_data_base,new_data_base,data_limit)) { - free_page_tables(new_data_base,data_limit); - return -ENOMEM; - } - return 0; -} - -/* - * Ok, this is the main fork-routine. It copies the system process - * information (task[nr]) and sets up the necessary registers. It - * also copies the data segment in it's entirety. - */ -int copy_process(int nr,long ebp,long edi,long esi,long gs,long none, - long ebx,long ecx,long edx, - long fs,long es,long ds, - long eip,long cs,long eflags,long esp,long ss) -{ - struct task_struct *p; - int i; - struct file *f; - - p = (struct task_struct *) get_free_page(); - if (!p) - return -EAGAIN; - *p = *current; /* NOTE! 
this doesn't copy the supervisor stack */ - p->state = TASK_RUNNING; - p->pid = last_pid; - p->father = current->pid; - p->counter = p->priority; - p->signal = 0; - p->alarm = 0; - p->leader = 0; /* process leadership doesn't inherit */ - p->utime = p->stime = 0; - p->cutime = p->cstime = 0; - p->start_time = jiffies; - p->tss.back_link = 0; - p->tss.esp0 = PAGE_SIZE + (long) p; - p->tss.ss0 = 0x10; - p->tss.eip = eip; - p->tss.eflags = eflags; - p->tss.eax = 0; - p->tss.ecx = ecx; - p->tss.edx = edx; - p->tss.ebx = ebx; - p->tss.esp = esp; - p->tss.ebp = ebp; - p->tss.esi = esi; - p->tss.edi = edi; - p->tss.es = es & 0xffff; - p->tss.cs = cs & 0xffff; - p->tss.ss = ss & 0xffff; - p->tss.ds = ds & 0xffff; - p->tss.fs = fs & 0xffff; - p->tss.gs = gs & 0xffff; - p->tss.ldt = _LDT(nr); - p->tss.trace_bitmap = 0x80000000; - if (last_task_used_math == current) - __asm__("fnsave %0"::"m" (p->tss.i387)); - if (copy_mem(nr,p)) { - free_page((long) p); - return -EAGAIN; - } - for (i=0; ifilp[i]) - f->f_count++; - if (current->pwd) - current->pwd->i_count++; - if (current->root) - current->root->i_count++; - set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss)); - set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt)); - task[nr] = p; /* do this last, just in case */ - return last_pid; -} - -int find_empty_process(void) -{ - int i; - - repeat: - if ((++last_pid)<0) last_pid=1; - for(i=0 ; ipid == last_pid) goto repeat; - for(i=1 ; i -#include -#include -#include -#include -#include -#include -#include - -/* - * This code handles all hd-interrupts, and read/write requests to - * the hard-disk. It is relatively straigthforward (not obvious maybe, - * but interrupts never are), while still being efficient, and never - * disabling interrupts (except to overcome possible race-condition). - * The elevator block-seek algorithm doesn't need to disable interrupts - * due to clever programming. 
- */ - -/* Max read/write errors/sector */ -#define MAX_ERRORS 5 -#define MAX_HD 2 -#define NR_REQUEST 32 - -/* - * This struct defines the HD's and their types. - * Currently defined for CP3044's, ie a modified - * type 17. - */ -static struct hd_i_struct{ - int head,sect,cyl,wpcom,lzone,ctl; - } hd_info[]= { HD_TYPE }; - -#define NR_HD ((sizeof (hd_info))/(sizeof (struct hd_i_struct))) - -static struct hd_struct { - long start_sect; - long nr_sects; -} hd[5*MAX_HD]={{0,0},}; - -static struct hd_request { - int hd; /* -1 if no request */ - int nsector; - int sector; - int head; - int cyl; - int cmd; - int errors; - struct buffer_head * bh; - struct hd_request * next; -} request[NR_REQUEST]; - -#define IN_ORDER(s1,s2) \ -((s1)->hd<(s2)->hd || (s1)->hd==(s2)->hd && \ -((s1)->cyl<(s2)->cyl || (s1)->cyl==(s2)->cyl && \ -((s1)->head<(s2)->head || (s1)->head==(s2)->head && \ -((s1)->sector<(s2)->sector)))) - -static struct hd_request * this_request = NULL; - -static int sorting=0; - -static void do_request(void); -static void reset_controller(void); -static void rw_abs_hd(int rw,unsigned int nr,unsigned int sec,unsigned int head, - unsigned int cyl,struct buffer_head * bh); -void hd_init(void); - -#define port_read(port,buf,nr) \ -__asm__("cld;rep;insw"::"d" (port),"D" (buf),"c" (nr):"cx","di") - -#define port_write(port,buf,nr) \ -__asm__("cld;rep;outsw"::"d" (port),"S" (buf),"c" (nr):"cx","si") - -extern void hd_interrupt(void); - -static struct task_struct * wait_for_request=NULL; - -static inline void lock_buffer(struct buffer_head * bh) -{ - if (bh->b_lock) - printk("hd.c: buffer multiply locked\n"); - bh->b_lock=1; -} - -static inline void unlock_buffer(struct buffer_head * bh) -{ - if (!bh->b_lock) - printk("hd.c: free buffer being unlocked\n"); - bh->b_lock=0; - wake_up(&bh->b_wait); -} - -static inline void wait_on_buffer(struct buffer_head * bh) -{ - cli(); - while (bh->b_lock) - sleep_on(&bh->b_wait); - sti(); -} - -void rw_hd(int rw, struct buffer_head * bh) 
-{ - unsigned int block,dev; - unsigned int sec,head,cyl; - - block = bh->b_blocknr << 1; - dev = MINOR(bh->b_dev); - if (dev >= 5*NR_HD || block+2 > hd[dev].nr_sects) - return; - block += hd[dev].start_sect; - dev /= 5; - __asm__("divl %4":"=a" (block),"=d" (sec):"0" (block),"1" (0), - "r" (hd_info[dev].sect)); - __asm__("divl %4":"=a" (cyl),"=d" (head):"0" (block),"1" (0), - "r" (hd_info[dev].head)); - rw_abs_hd(rw,dev,sec+1,head,cyl,bh); -} - -/* This may be used only once, enforced by 'static int callable' */ -int sys_setup(void) -{ - static int callable = 1; - int i,drive; - struct partition *p; - - if (!callable) - return -1; - callable = 0; - for (drive=0 ; driveb_uptodate) { - printk("Unable to read partition table of drive %d\n\r", - drive); - panic(""); - } - if (start_buffer->b_data[510] != 0x55 || (unsigned char) - start_buffer->b_data[511] != 0xAA) { - printk("Bad partition table on drive %d\n\r",drive); - panic(""); - } - p = 0x1BE + (void *)start_buffer->b_data; - for (i=1;i<5;i++,p++) { - hd[i+5*drive].start_sect = p->start_sect; - hd[i+5*drive].nr_sects = p->nr_sects; - } - } - printk("Partition table%s ok.\n\r",(NR_HD>1)?"s":""); - mount_root(); - return (0); -} - -/* - * This is the pointer to a routine to be executed at every hd-interrupt. - * Interesting way of doing things, but should be rather practical. 
- */ -void (*do_hd)(void) = NULL; - -static int controller_ready(void) -{ - int retries=1000; - - while (--retries && (inb(HD_STATUS)&0xc0)!=0x40); - return (retries); -} - -static int win_result(void) -{ - int i=inb(HD_STATUS); - - if ((i & (BUSY_STAT | READY_STAT | WRERR_STAT | SEEK_STAT | ERR_STAT)) - == (READY_STAT | SEEK_STAT)) - return(0); /* ok */ - if (i&1) i=inb(HD_ERROR); - return (1); -} - -static void hd_out(unsigned int drive,unsigned int nsect,unsigned int sect, - unsigned int head,unsigned int cyl,unsigned int cmd, - void (*intr_addr)(void)) -{ - register int port asm("dx"); - - if (drive>1 || head>15) - panic("Trying to write bad sector"); - if (!controller_ready()) - panic("HD controller not ready"); - do_hd = intr_addr; - outb(_CTL,HD_CMD); - port=HD_DATA; - outb_p(_WPCOM,++port); - outb_p(nsect,++port); - outb_p(sect,++port); - outb_p(cyl,++port); - outb_p(cyl>>8,++port); - outb_p(0xA0|(drive<<4)|head,++port); - outb(cmd,++port); -} - -static int drive_busy(void) -{ - unsigned int i; - - for (i = 0; i < 100000; i++) - if (READY_STAT == (inb(HD_STATUS) & (BUSY_STAT | READY_STAT))) - break; - i = inb(HD_STATUS); - i &= BUSY_STAT | READY_STAT | SEEK_STAT; - if (i == READY_STAT | SEEK_STAT) - return(0); - printk("HD controller times out\n\r"); - return(1); -} - -static void reset_controller(void) -{ - int i; - - outb(4,HD_CMD); - for(i = 0; i < 1000; i++) nop(); - outb(0,HD_CMD); - for(i = 0; i < 10000 && drive_busy(); i++) /* nothing */; - if (drive_busy()) - printk("HD-controller still busy\n\r"); - if((i = inb(ERR_STAT)) != 1) - printk("HD-controller reset failed: %02x\n\r",i); -} - -static void reset_hd(int nr) -{ - reset_controller(); - hd_out(nr,_SECT,_SECT,_HEAD-1,_CYL,WIN_SPECIFY,&do_request); -} - -void unexpected_hd_interrupt(void) -{ - panic("Unexpected HD interrupt\n\r"); -} - -static void bad_rw_intr(void) -{ - int i = this_request->hd; - - if (this_request->errors++ >= MAX_ERRORS) { - this_request->bh->b_uptodate = 0; - 
unlock_buffer(this_request->bh); - wake_up(&wait_for_request); - this_request->hd = -1; - this_request=this_request->next; - } - reset_hd(i); -} - -static void read_intr(void) -{ - if (win_result()) { - bad_rw_intr(); - return; - } - port_read(HD_DATA,this_request->bh->b_data+ - 512*(this_request->nsector&1),256); - this_request->errors = 0; - if (--this_request->nsector) - return; - this_request->bh->b_uptodate = 1; - this_request->bh->b_dirt = 0; - wake_up(&wait_for_request); - unlock_buffer(this_request->bh); - this_request->hd = -1; - this_request=this_request->next; - do_request(); -} - -static void write_intr(void) -{ - if (win_result()) { - bad_rw_intr(); - return; - } - if (--this_request->nsector) { - port_write(HD_DATA,this_request->bh->b_data+512,256); - return; - } - this_request->bh->b_uptodate = 1; - this_request->bh->b_dirt = 0; - wake_up(&wait_for_request); - unlock_buffer(this_request->bh); - this_request->hd = -1; - this_request=this_request->next; - do_request(); -} - -static void do_request(void) -{ - int i,r; - - if (sorting) - return; - if (!this_request) { - do_hd=NULL; - return; - } - if (this_request->cmd == WIN_WRITE) { - hd_out(this_request->hd,this_request->nsector,this_request-> - sector,this_request->head,this_request->cyl, - this_request->cmd,&write_intr); - for(i=0 ; i<3000 && !(r=inb_p(HD_STATUS)&DRQ_STAT) ; i++) - /* nothing */ ; - if (!r) { - reset_hd(this_request->hd); - return; - } - port_write(HD_DATA,this_request->bh->b_data+ - 512*(this_request->nsector&1),256); - } else if (this_request->cmd == WIN_READ) { - hd_out(this_request->hd,this_request->nsector,this_request-> - sector,this_request->head,this_request->cyl, - this_request->cmd,&read_intr); - } else - panic("unknown hd-command"); -} - -/* - * add-request adds a request to the linked list. - * It sets the 'sorting'-variable when doing something - * that interrupts shouldn't touch. 
- */ -static void add_request(struct hd_request * req) -{ - struct hd_request * tmp; - - if (req->nsector != 2) - panic("nsector!=2 not implemented"); -/* - * Not to mess up the linked lists, we never touch the two first - * entries (not this_request, as it is used by current interrups, - * and not this_request->next, as it can be assigned to this_request). - * This is not too high a price to pay for the ability of not - * disabling interrupts. - */ - sorting=1; - if (!(tmp=this_request)) - this_request=req; - else { - if (!(tmp->next)) - tmp->next=req; - else { - tmp=tmp->next; - for ( ; tmp->next ; tmp=tmp->next) - if ((IN_ORDER(tmp,req) || - !IN_ORDER(tmp,tmp->next)) && - IN_ORDER(req,tmp->next)) - break; - req->next=tmp->next; - tmp->next=req; - } - } - sorting=0; -/* - * NOTE! As a result of sorting, the interrupts may have died down, - * as they aren't redone due to locking with sorting=1. They might - * also never have started, if this is the first request in the queue, - * so we restart them if necessary. - */ - if (!do_hd) - do_request(); -} - -void rw_abs_hd(int rw,unsigned int nr,unsigned int sec,unsigned int head, - unsigned int cyl,struct buffer_head * bh) -{ - struct hd_request * req; - - if (rw!=READ && rw!=WRITE) - panic("Bad hd command, must be R/W"); - lock_buffer(bh); -repeat: - for (req=0+request ; reqhd<0) - break; - if (req==NR_REQUEST+request) { - sleep_on(&wait_for_request); - goto repeat; - } - req->hd=nr; - req->nsector=2; - req->sector=sec; - req->head=head; - req->cyl=cyl; - req->cmd = ((rw==READ)?WIN_READ:WIN_WRITE); - req->bh=bh; - req->errors=0; - req->next=NULL; - add_request(req); - wait_on_buffer(bh); -} - -void hd_init(void) -{ - int i; - - for (i=0 ; i - -/* - * This isn't the library routine, it is only used in the kernel. - * as such, we don't care about years<1970 etc, but assume everything - * is ok. Similarly, TZ etc is happily ignored. We just do everything - * as easily as possible. 
Let's find something public for the library - * routines (although I think minix times is public). - */ -/* - * PS. I hate whoever though up the year 1970 - couldn't they have gotten - * a leap-year instead? I also hate Gregorius, pope or no. I'm grumpy. - */ -#define MINUTE 60 -#define HOUR (60*MINUTE) -#define DAY (24*HOUR) -#define YEAR (365*DAY) - -/* interestingly, we assume leap-years */ -static int month[12] = { - 0, - DAY*(31), - DAY*(31+29), - DAY*(31+29+31), - DAY*(31+29+31+30), - DAY*(31+29+31+30+31), - DAY*(31+29+31+30+31+30), - DAY*(31+29+31+30+31+30+31), - DAY*(31+29+31+30+31+30+31+31), - DAY*(31+29+31+30+31+30+31+31+30), - DAY*(31+29+31+30+31+30+31+31+30+31), - DAY*(31+29+31+30+31+30+31+31+30+31+30) -}; - -long kernel_mktime(struct tm * tm) -{ - long res; - int year; - - year = tm->tm_year - 70; -/* magic offsets (y+1) needed to get leapyears right.*/ - res = YEAR*year + DAY*((year+1)/4); - res += month[tm->tm_mon]; -/* and (y+2) here. If it wasn't a leap-year, we have to adjust */ - if (tm->tm_mon>1 && ((year+2)%4)) - res -= DAY; - res += DAY*(tm->tm_mday-1); - res += HOUR*tm->tm_hour; - res += MINUTE*tm->tm_min; - res += tm->tm_sec; - return res; -} -/* - * This function is used through-out the kernel (includeinh mm and fs) - * to indicate a major problem. - */ -#include - -volatile void panic(const char * s) -{ - printk("Kernel panic: %s\n\r",s); - for(;;); -} -/* - * When in kernel-mode, we cannot use printf, as fs is liable to - * point to 'interesting' things. Make a printf with fs-saving, and - * all is well. - */ -#include -#include - -#include - -static char buf[1024]; - -int printk(const char *fmt, ...) 
-{ - va_list args; - int i; - - va_start(args, fmt); - i=vsprintf(buf,fmt,args); - va_end(args); - __asm__("push %%fs\n\t" - "push %%ds\n\t" - "pop %%fs\n\t" - "pushl %0\n\t" - "pushl $_buf\n\t" - "pushl $0\n\t" - "call _tty_write\n\t" - "addl $8,%%esp\n\t" - "popl %0\n\t" - "pop %%fs" - ::"r" (i):"ax","cx","dx"); - return i; -} -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid(), which just extracts a field from - * current-task - */ -#include -#include -#include -#include -#include -#include -#include - -#define LATCH (1193180/HZ) - -extern void mem_use(void); - -extern int timer_interrupt(void); -extern int system_call(void); - -union task_union { - struct task_struct task; - char stack[PAGE_SIZE]; -}; - -static union task_union init_task = {INIT_TASK,}; - -long volatile jiffies=0; -long startup_time=0; -struct task_struct *current = &(init_task.task), *last_task_used_math = NULL; - -struct task_struct * task[NR_TASKS] = {&(init_task.task), }; - -long user_stack [ PAGE_SIZE>>2 ] ; - -struct { - long * a; - short b; - } stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 }; -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - */ -void math_state_restore() -{ - if (last_task_used_math) - __asm__("fnsave %0"::"m" (last_task_used_math->tss.i387)); - if (current->used_math) - __asm__("frstor %0"::"m" (current->tss.i387)); - else { - __asm__("fninit"::); - current->used_math=1; - } - last_task_used_math=current; -} - -/* - * 'schedule()' is the scheduler function. This is GOOD CODE! There - * probably won't be any reason to change this, as it should work well - * in all circumstances (ie gives IO-bound processes good response etc). - * The one thing you might take a look at is the signal-handler code here. - * - * NOTE!! 
Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. - */ -void schedule(void) -{ - int i,next,c; - struct task_struct ** p; - -/* check alarm, wake up any interruptible tasks that have got a signal */ - - for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) - if (*p) { - if ((*p)->alarm && (*p)->alarm < jiffies) { - (*p)->signal |= (1<<(SIGALRM-1)); - (*p)->alarm = 0; - } - if ((*p)->signal && (*p)->state==TASK_INTERRUPTIBLE) - (*p)->state=TASK_RUNNING; - } - -/* this is the scheduler proper: */ - - while (1) { - c = -1; - next = 0; - i = NR_TASKS; - p = &task[NR_TASKS]; - while (--i) { - if (!*--p) - continue; - if ((*p)->state == TASK_RUNNING && (*p)->counter > c) - c = (*p)->counter, next = i; - } - if (c) break; - for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) - if (*p) - (*p)->counter = ((*p)->counter >> 1) + - (*p)->priority; - } - switch_to(next); -} - -int sys_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return 0; -} - -void sleep_on(struct task_struct **p) -{ - struct task_struct *tmp; - - if (!p) - return; - if (current == &(init_task.task)) - panic("task[0] trying to sleep"); - tmp = *p; - *p = current; - current->state = TASK_UNINTERRUPTIBLE; - schedule(); - if (tmp) - tmp->state=0; -} - -void interruptible_sleep_on(struct task_struct **p) -{ - struct task_struct *tmp; - - if (!p) - return; - if (current == &(init_task.task)) - panic("task[0] trying to sleep"); - tmp=*p; - *p=current; -repeat: current->state = TASK_INTERRUPTIBLE; - schedule(); - if (*p && *p != current) { - (**p).state=0; - goto repeat; - } - *p=NULL; - if (tmp) - tmp->state=0; -} - -void wake_up(struct task_struct **p) -{ - if (p && *p) { - (**p).state=0; - *p=NULL; - } -} - -void do_timer(long cpl) -{ - if (cpl) - current->utime++; - else - current->stime++; - if ((--current->counter)>0) return; - current->counter=0; - if (!cpl) return; - schedule(); 
-} - -int sys_alarm(long seconds) -{ - current->alarm = (seconds>0)?(jiffies+HZ*seconds):0; - return seconds; -} - -int sys_getpid(void) -{ - return current->pid; -} - -int sys_getppid(void) -{ - return current->father; -} - -int sys_getuid(void) -{ - return current->uid; -} - -int sys_geteuid(void) -{ - return current->euid; -} - -int sys_getgid(void) -{ - return current->gid; -} - -int sys_getegid(void) -{ - return current->egid; -} - -int sys_nice(long increment) -{ - if (current->priority-increment>0) - current->priority -= increment; - return 0; -} - -int sys_signal(long signal,long addr,long restorer) -{ - long i; - - switch (signal) { - case SIGHUP: case SIGINT: case SIGQUIT: case SIGILL: - case SIGTRAP: case SIGABRT: case SIGFPE: case SIGUSR1: - case SIGSEGV: case SIGUSR2: case SIGPIPE: case SIGALRM: - case SIGCHLD: - i=(long) current->sig_fn[signal-1]; - current->sig_fn[signal-1] = (fn_ptr) addr; - current->sig_restorer = (fn_ptr) restorer; - return i; - default: return -1; - } -} - -void sched_init(void) -{ - int i; - struct desc_struct * p; - - set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss)); - set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt)); - p = gdt+2+FIRST_TSS_ENTRY; - for(i=1;ia=p->b=0; - p++; - p->a=p->b=0; - p++; - } - ltr(0); - lldt(0); - outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff , 0x40); /* LSB */ - outb(LATCH >> 8 , 0x40); /* MSB */ - set_intr_gate(0x20,&timer_interrupt); - outb(inb_p(0x21)&~0x01,0x21); - set_system_gate(0x80,&system_call); -} -/* - * serial.c - * - * This module implements the rs232 io functions - * void rs_write(struct tty_struct * queue); - * void rs_init(void); - * and all interrupts pertaining to serial IO. 
- */ - -#include -#include -#include -#include - -#define WAKEUP_CHARS (TTY_BUF_SIZE/4) - -extern void rs1_interrupt(void); -extern void rs2_interrupt(void); - -static void init(int port) -{ - outb_p(0x80,port+3); /* set DLAB of line control reg */ - outb_p(0x30,port); /* LS of divisor (48 -> 2400 bps */ - outb_p(0x00,port+1); /* MS of divisor */ - outb_p(0x03,port+3); /* reset DLAB */ - outb_p(0x0b,port+4); /* set DTR,RTS, OUT_2 */ - outb_p(0x0d,port+1); /* enable all intrs but writes */ - (void)inb(port); /* read data port to reset things (?) */ -} - -void rs_init(void) -{ - set_intr_gate(0x24,rs1_interrupt); - set_intr_gate(0x23,rs2_interrupt); - init(tty_table[1].read_q.data); - init(tty_table[2].read_q.data); - outb(inb_p(0x21)&0xE7,0x21); -} - -/* - * This routine gets called when tty_write has put something into - * the write_queue. It must check wheter the queue is empty, and - * set the interrupt register accordingly - * - * void _rs_write(struct tty_struct * tty); - */ -void rs_write(struct tty_struct * tty) -{ - cli(); - if (!EMPTY(tty->write_q)) - outb(inb_p(tty->write_q.data+1)|0x02,tty->write_q.data+1); - sti(); -} -#include - -#include -#include -#include -#include -#include -#include - -int sys_ftime() -{ - return -ENOSYS; -} - -int sys_mknod() -{ - return -ENOSYS; -} - -int sys_break() -{ - return -ENOSYS; -} - -int sys_mount() -{ - return -ENOSYS; -} - -int sys_umount() -{ - return -ENOSYS; -} - -int sys_ustat(int dev,struct ustat * ubuf) -{ - return -1; -} - -int sys_ptrace() -{ - return -ENOSYS; -} - -int sys_stty() -{ - return -ENOSYS; -} - -int sys_gtty() -{ - return -ENOSYS; -} - -int sys_rename() -{ - return -ENOSYS; -} - -int sys_prof() -{ - return -ENOSYS; -} - -int sys_setgid(int gid) -{ - if (current->euid && current->uid) - if (current->gid==gid || current->sgid==gid) - current->egid=gid; - else - return -EPERM; - else - current->gid=current->egid=gid; - return 0; -} - -int sys_acct() -{ - return -ENOSYS; -} - -int sys_phys() -{ - 
return -ENOSYS; -} - -int sys_lock() -{ - return -ENOSYS; -} - -int sys_mpx() -{ - return -ENOSYS; -} - -int sys_ulimit() -{ - return -ENOSYS; -} - -int sys_time(long * tloc) -{ - int i; - - i = CURRENT_TIME; - if (tloc) { - verify_area(tloc,4); - put_fs_long(i,(unsigned long *)tloc); - } - return i; -} - -int sys_setuid(int uid) -{ - if (current->euid && current->uid) - if (uid==current->uid || current->suid==current->uid) - current->euid=uid; - else - return -EPERM; - else - current->euid=current->uid=uid; - return 0; -} - -int sys_stime(long * tptr) -{ - if (current->euid && current->uid) - return -1; - startup_time = get_fs_long((unsigned long *)tptr) - jiffies/HZ; - return 0; -} - -int sys_times(struct tms * tbuf) -{ - if (!tbuf) - return jiffies; - verify_area(tbuf,sizeof *tbuf); - put_fs_long(current->utime,(unsigned long *)&tbuf->tms_utime); - put_fs_long(current->stime,(unsigned long *)&tbuf->tms_stime); - put_fs_long(current->cutime,(unsigned long *)&tbuf->tms_cutime); - put_fs_long(current->cstime,(unsigned long *)&tbuf->tms_cstime); - return jiffies; -} - -int sys_brk(unsigned long end_data_seg) -{ - if (end_data_seg >= current->end_code && - end_data_seg < current->start_stack - 16384) - current->brk = end_data_seg; - return current->brk; -} - -/* - * This needs some heave checking ... - * I just haven't get the stomach for it. I also don't fully - * understand sessions/pgrp etc. Let somebody who does explain it. 
- */ -int sys_setpgid(int pid, int pgid) -{ - int i; - - if (!pid) - pid = current->pid; - if (!pgid) - pgid = pid; - for (i=0 ; ipid==pid) { - if (task[i]->leader) - return -EPERM; - if (task[i]->session != current->session) - return -EPERM; - task[i]->pgrp = pgid; - return 0; - } - return -ESRCH; -} - -int sys_getpgrp(void) -{ - return current->pgrp; -} - -int sys_setsid(void) -{ - if (current->uid && current->euid) - return -EPERM; - if (current->leader) - return -EPERM; - current->leader = 1; - current->session = current->pgrp = current->pid; - current->tty = -1; - return current->pgrp; -} - -int sys_uname(struct utsname * name) -{ - static struct utsname thisname = { - "linux .0","nodename","release ","version ","machine " - }; - int i; - - if (!name) return -1; - verify_area(name,sizeof *name); - for(i=0;iumask; - - current->umask = mask & 0777; - return (old); -} -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. Currently mostly a debugging-aid, will be extended - * to mainly kill the offending process (probably by giving it a signal, - * but possibly by killing it outright if necessary). 
- */ -#include - -#include -#include -#include -#include -#include - -#define get_seg_byte(seg,addr) ({ \ -register char __res; \ -__asm__("push %%fs;mov %%ax,%%fs;movb %%fs:%2,%%al;pop %%fs" \ - :"=a" (__res):"0" (seg),"m" (*(addr))); \ -__res;}) - -#define get_seg_long(seg,addr) ({ \ -register unsigned long __res; \ -__asm__("push %%fs;mov %%ax,%%fs;movl %%fs:%2,%%eax;pop %%fs" \ - :"=a" (__res):"0" (seg),"m" (*(addr))); \ -__res;}) - -#define _fs() ({ \ -register unsigned short __res; \ -__asm__("mov %%fs,%%ax":"=a" (__res):); \ -__res;}) - -int do_exit(long code); - -void page_exception(void); - -void divide_error(void); -void debug(void); -void nmi(void); -void int3(void); -void overflow(void); -void bounds(void); -void invalid_op(void); -void device_not_available(void); -void double_fault(void); -void coprocessor_segment_overrun(void); -void invalid_TSS(void); -void segment_not_present(void); -void stack_segment(void); -void general_protection(void); -void page_fault(void); -void coprocessor_error(void); -void reserved(void); - -static void die(char * str,long esp_ptr,long nr) -{ - long * esp = (long *) esp_ptr; - int i; - - printk("%s: %04x\n\r",str,nr&0xffff); - printk("EIP:\t%04x:%p\nEFLAGS:\t%p\nESP:\t%04x:%p\n", - esp[1],esp[0],esp[2],esp[4],esp[3]); - printk("fs: %04x\n",_fs()); - printk("base: %p, limit: %p\n",get_base(current->ldt[1]),get_limit(0x17)); - if (esp[4] == 0x17) { - printk("Stack: "); - for (i=0;i<4;i++) - printk("%p ",get_seg_long(0x17,i+(long *)esp[3])); - printk("\n"); - } - str(i); - printk("Pid: %d, process nr: %d\n\r",current->pid,0xffff & i); - for(i=0;i<10;i++) - printk("%02x ",0xff & get_seg_byte(esp[1],(i+(char *)esp[0]))); - printk("\n\r"); - do_exit(11); /* play segment exception */ -} - -void do_double_fault(long esp, long error_code) -{ - die("double fault",esp,error_code); -} - -void do_general_protection(long esp, long error_code) -{ - die("general protection",esp,error_code); -} - -void do_divide_error(long esp, long 
error_code) -{ - die("divide error",esp,error_code); -} - -void do_int3(long * esp, long error_code, - long fs,long es,long ds, - long ebp,long esi,long edi, - long edx,long ecx,long ebx,long eax) -{ - int tr; - - __asm__("str %%ax":"=a" (tr):"0" (0)); - printk("eax\t\tebx\t\tecx\t\tedx\n\r%8x\t%8x\t%8x\t%8x\n\r", - eax,ebx,ecx,edx); - printk("esi\t\tedi\t\tebp\t\tesp\n\r%8x\t%8x\t%8x\t%8x\n\r", - esi,edi,ebp,(long) esp); - printk("\n\rds\tes\tfs\ttr\n\r%4x\t%4x\t%4x\t%4x\n\r", - ds,es,fs,tr); - printk("EIP: %8x CS: %4x EFLAGS: %8x\n\r",esp[0],esp[1],esp[2]); -} - -void do_nmi(long esp, long error_code) -{ - die("nmi",esp,error_code); -} - -void do_debug(long esp, long error_code) -{ - die("debug",esp,error_code); -} - -void do_overflow(long esp, long error_code) -{ - die("overflow",esp,error_code); -} - -void do_bounds(long esp, long error_code) -{ - die("bounds",esp,error_code); -} - -void do_invalid_op(long esp, long error_code) -{ - die("invalid operand",esp,error_code); -} - -void do_device_not_available(long esp, long error_code) -{ - die("device not available",esp,error_code); -} - -void do_coprocessor_segment_overrun(long esp, long error_code) -{ - die("coprocessor segment overrun",esp,error_code); -} - -void do_invalid_TSS(long esp,long error_code) -{ - die("invalid TSS",esp,error_code); -} - -void do_segment_not_present(long esp,long error_code) -{ - die("segment not present",esp,error_code); -} - -void do_stack_segment(long esp,long error_code) -{ - die("stack segment",esp,error_code); -} - -void do_coprocessor_error(long esp, long error_code) -{ - die("coprocessor error",esp,error_code); -} - -void do_reserved(long esp, long error_code) -{ - die("reserved (15,17-31) error",esp,error_code); -} - -void trap_init(void) -{ - int i; - - set_trap_gate(0,÷_error); - set_trap_gate(1,&debug); - set_trap_gate(2,&nmi); - set_system_gate(3,&int3); /* int3-5 can be called from all */ - set_system_gate(4,&overflow); - set_system_gate(5,&bounds); - 
set_trap_gate(6,&invalid_op); - set_trap_gate(7,&device_not_available); - set_trap_gate(8,&double_fault); - set_trap_gate(9,&coprocessor_segment_overrun); - set_trap_gate(10,&invalid_TSS); - set_trap_gate(11,&segment_not_present); - set_trap_gate(12,&stack_segment); - set_trap_gate(13,&general_protection); - set_trap_gate(14,&page_fault); - set_trap_gate(15,&reserved); - set_trap_gate(16,&coprocessor_error); - for (i=17;i<32;i++) - set_trap_gate(i,&reserved); -/* __asm__("movl $0x3ff000,%%eax\n\t" - "movl %%eax,%%db0\n\t" - "movl $0x000d0303,%%eax\n\t" - "movl %%eax,%%db7" - :::"ax");*/ -} - -/* - * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles - * or rs-channels. It also implements echoing, cooked mode etc (well, - * not currently, but ...) - */ -#include -#include -#include - -#define ALRMMASK (1<<(SIGALRM-1)) - -#include -#include -#include -#include - -#define _L_FLAG(tty,f) ((tty)->termios.c_lflag & f) -#define _I_FLAG(tty,f) ((tty)->termios.c_iflag & f) -#define _O_FLAG(tty,f) ((tty)->termios.c_oflag & f) - -#define L_CANON(tty) _L_FLAG((tty),ICANON) -#define L_ISIG(tty) _L_FLAG((tty),ISIG) -#define L_ECHO(tty) _L_FLAG((tty),ECHO) -#define L_ECHOE(tty) _L_FLAG((tty),ECHOE) -#define L_ECHOK(tty) _L_FLAG((tty),ECHOK) -#define L_ECHOCTL(tty) _L_FLAG((tty),ECHOCTL) -#define L_ECHOKE(tty) _L_FLAG((tty),ECHOKE) - -#define I_UCLC(tty) _I_FLAG((tty),IUCLC) -#define I_NLCR(tty) _I_FLAG((tty),INLCR) -#define I_CRNL(tty) _I_FLAG((tty),ICRNL) -#define I_NOCR(tty) _I_FLAG((tty),IGNCR) - -#define O_POST(tty) _O_FLAG((tty),OPOST) -#define O_NLCR(tty) _O_FLAG((tty),ONLCR) -#define O_CRNL(tty) _O_FLAG((tty),OCRNL) -#define O_NLRET(tty) _O_FLAG((tty),ONLRET) -#define O_LCUC(tty) _O_FLAG((tty),OLCUC) - -struct tty_struct tty_table[] = { - { - {0, - OPOST|ONLCR, /* change outgoing NL to CRNL */ - 0, - ICANON | ECHO | ECHOCTL | ECHOKE, - 0, /* console termio */ - INIT_C_CC}, - 0, /* initial pgrp */ - 0, /* initial stopped */ - con_write, - {0,0,0,0,""}, /* 
console read-queue */ - {0,0,0,0,""}, /* console write-queue */ - {0,0,0,0,""} /* console secondary queue */ - },{ - {0, /*IGNCR*/ - OPOST | ONLRET, /* change outgoing NL to CR */ - B2400 | CS8, - 0, - 0, - INIT_C_CC}, - 0, - 0, - rs_write, - {0x3f8,0,0,0,""}, /* rs 1 */ - {0x3f8,0,0,0,""}, - {0,0,0,0,""} - },{ - {0, /*IGNCR*/ - OPOST | ONLRET, /* change outgoing NL to CR */ - B2400 | CS8, - 0, - 0, - INIT_C_CC}, - 0, - 0, - rs_write, - {0x2f8,0,0,0,""}, /* rs 2 */ - {0x2f8,0,0,0,""}, - {0,0,0,0,""} - } -}; - -/* - * these are the tables used by the machine code handlers. - * you can implement pseudo-tty's or something by changing - * them. Currently not done. - */ -struct tty_queue * table_list[]={ - &tty_table[0].read_q, &tty_table[0].write_q, - &tty_table[1].read_q, &tty_table[1].write_q, - &tty_table[2].read_q, &tty_table[2].write_q - }; - -void tty_init(void) -{ - rs_init(); - con_init(); -} - -void tty_intr(struct tty_struct * tty, int signal) -{ - int i; - - if (tty->pgrp <= 0) - return; - for (i=0;ipgrp==tty->pgrp) - task[i]->signal |= 1<<(signal-1); -} - -static void sleep_if_empty(struct tty_queue * queue) -{ - cli(); - while (!current->signal && EMPTY(*queue)) - interruptible_sleep_on(&queue->proc_list); - sti(); -} - -static void sleep_if_full(struct tty_queue * queue) -{ - if (!FULL(*queue)) - return; - cli(); - while (!current->signal && LEFT(*queue)<128) - interruptible_sleep_on(&queue->proc_list); - sti(); -} - -void copy_to_cooked(struct tty_struct * tty) -{ - signed char c; - - while (!EMPTY(tty->read_q) && !FULL(tty->secondary)) { - GETCH(tty->read_q,c); - if (c==13) - if (I_CRNL(tty)) - c=10; - else if (I_NOCR(tty)) - continue; - else ; - else if (c==10 && I_NLCR(tty)) - c=13; - if (I_UCLC(tty)) - c=tolower(c); - if (L_CANON(tty)) { - if (c==ERASE_CHAR(tty)) { - if (EMPTY(tty->secondary) || - (c=LAST(tty->secondary))==10 || - c==EOF_CHAR(tty)) - continue; - if (L_ECHO(tty)) { - if (c<32) - PUTCH(127,tty->write_q); - PUTCH(127,tty->write_q); - 
tty->write(tty); - } - DEC(tty->secondary.head); - continue; - } - if (c==STOP_CHAR(tty)) { - tty->stopped=1; - continue; - } - if (c==START_CHAR(tty)) { - tty->stopped=0; - continue; - } - } - if (!L_ISIG(tty)) { - if (c==INTR_CHAR(tty)) { - tty_intr(tty,SIGINT); - continue; - } - } - if (c==10 || c==EOF_CHAR(tty)) - tty->secondary.data++; - if (L_ECHO(tty)) { - if (c==10) { - PUTCH(10,tty->write_q); - PUTCH(13,tty->write_q); - } else if (c<32) { - if (L_ECHOCTL(tty)) { - PUTCH('^',tty->write_q); - PUTCH(c+64,tty->write_q); - } - } else - PUTCH(c,tty->write_q); - tty->write(tty); - } - PUTCH(c,tty->secondary); - } - wake_up(&tty->secondary.proc_list); -} - -int tty_read(unsigned channel, char * buf, int nr) -{ - struct tty_struct * tty; - char c, * b=buf; - int minimum,time,flag=0; - long oldalarm; - - if (channel>2 || nr<0) return -1; - tty = &tty_table[channel]; - oldalarm = current->alarm; - time = (unsigned) 10*tty->termios.c_cc[VTIME]; - minimum = (unsigned) tty->termios.c_cc[VMIN]; - if (time && !minimum) { - minimum=1; - if (flag=(!oldalarm || time+jiffiesalarm = time+jiffies; - } - if (minimum>nr) - minimum=nr; - while (nr>0) { - if (flag && (current->signal & ALRMMASK)) { - current->signal &= ~ALRMMASK; - break; - } - if (current->signal) - break; - if (EMPTY(tty->secondary) || (L_CANON(tty) && - !tty->secondary.data && LEFT(tty->secondary)>20)) { - sleep_if_empty(&tty->secondary); - continue; - } - do { - GETCH(tty->secondary,c); - if (c==EOF_CHAR(tty) || c==10) - tty->secondary.data--; - if (c==EOF_CHAR(tty) && L_CANON(tty)) - return (b-buf); - else { - put_fs_byte(c,b++); - if (!--nr) - break; - } - } while (nr>0 && !EMPTY(tty->secondary)); - if (time && !L_CANON(tty)) - if (flag=(!oldalarm || time+jiffiesalarm = time+jiffies; - else - current->alarm = oldalarm; - if (L_CANON(tty)) { - if (b-buf) - break; - } else if (b-buf >= minimum) - break; - } - current->alarm = oldalarm; - if (current->signal && !(b-buf)) - return -EINTR; - return (b-buf); -} - 
-int tty_write(unsigned channel, char * buf, int nr) -{ - static cr_flag=0; - struct tty_struct * tty; - char c, *b=buf; - - if (channel>2 || nr<0) return -1; - tty = channel + tty_table; - while (nr>0) { - sleep_if_full(&tty->write_q); - if (current->signal) - break; - while (nr>0 && !FULL(tty->write_q)) { - c=get_fs_byte(b); - if (O_POST(tty)) { - if (c=='\r' && O_CRNL(tty)) - c='\n'; - else if (c=='\n' && O_NLRET(tty)) - c='\r'; - if (c=='\n' && !cr_flag && O_NLCR(tty)) { - cr_flag = 1; - PUTCH(13,tty->write_q); - continue; - } - if (O_LCUC(tty)) - c=toupper(c); - } - b++; nr--; - cr_flag = 0; - PUTCH(c,tty->write_q); - } - tty->write(tty); - if (nr>0) - schedule(); - } - return (b-buf); -} - -/* - * Jeh, sometimes I really like the 386. - * This routine is called from an interrupt, - * and there should be absolutely no problem - * with sleeping even in an interrupt (I hope). - * Of course, if somebody proves me wrong, I'll - * hate intel for all time :-). We'll have to - * be careful and see to reinstating the interrupt - * chips before calling this, though. - */ -void do_tty_interrupt(int tty) -{ - copy_to_cooked(tty_table+tty); -} -/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. 
*/ -/* - * Wirzenius wrote this portably, Torvalds fucked it up :-) - */ - -#include -#include - -/* we use this so that we can do without the ctype library */ -#define is_digit(c) ((c) >= '0' && (c) <= '9') - -static int skip_atoi(const char **s) -{ - int i=0; - - while (is_digit(**s)) - i = i*10 + *((*s)++) - '0'; - return i; -} - -#define ZEROPAD 1 /* pad with zero */ -#define SIGN 2 /* unsigned/signed long */ -#define PLUS 4 /* show plus */ -#define SPACE 8 /* space if plus */ -#define LEFT 16 /* left justified */ -#define SPECIAL 32 /* 0x */ -#define SMALL 64 /* use 'abcdef' instead of 'ABCDEF' */ - -#define do_div(n,base) ({ \ -int __res; \ -__asm__("divl %4":"=a" (n),"=d" (__res):"0" (n),"1" (0),"r" (base)); \ -__res; }) - -static char * number(char * str, int num, int base, int size, int precision - ,int type) -{ - char c,sign,tmp[36]; - const char *digits="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - int i; - - if (type&SMALL) digits="0123456789abcdefghijklmnopqrstuvwxyz"; - if (type&LEFT) type &= ~ZEROPAD; - if (base<2 || base>36) - return 0; - c = (type & ZEROPAD) ? '0' : ' ' ; - if (type&SIGN && num<0) { - sign='-'; - num = -num; - } else - sign=(type&PLUS) ? '+' : ((type&SPACE) ? 
' ' : 0); - if (sign) size--; - if (type&SPECIAL) - if (base==16) size -= 2; - else if (base==8) size--; - i=0; - if (num==0) - tmp[i++]='0'; - else while (num!=0) - tmp[i++]=digits[do_div(num,base)]; - if (i>precision) precision=i; - size -= precision; - if (!(type&(ZEROPAD+LEFT))) - while(size-->0) - *str++ = ' '; - if (sign) - *str++ = sign; - if (type&SPECIAL) - if (base==8) - *str++ = '0'; - else if (base==16) { - *str++ = '0'; - *str++ = digits[33]; - } - if (!(type&LEFT)) - while(size-->0) - *str++ = c; - while(i0) - *str++ = tmp[i]; - while(size-->0) - *str++ = ' '; - return str; -} - -int vsprintf(char *buf, const char *fmt, va_list args) -{ - int len; - int i; - char * str; - char *s; - int *ip; - - int flags; /* flags to number() */ - - int field_width; /* width of output field */ - int precision; /* min. # of digits for integers; max - number of chars for from string */ - int qualifier; /* 'h', 'l', or 'L' for integer fields */ - - for (str=buf ; *fmt ; ++fmt) { - if (*fmt != '%') { - *str++ = *fmt; - continue; - } - - /* process flags */ - flags = 0; - repeat: - ++fmt; /* this also skips first '%' */ - switch (*fmt) { - case '-': flags |= LEFT; goto repeat; - case '+': flags |= PLUS; goto repeat; - case ' ': flags |= SPACE; goto repeat; - case '#': flags |= SPECIAL; goto repeat; - case '0': flags |= ZEROPAD; goto repeat; - } - - /* get field width */ - field_width = -1; - if (is_digit(*fmt)) - field_width = skip_atoi(&fmt); - else if (*fmt == '*') { - /* it's the next argument */ - field_width = va_arg(args, int); - if (field_width < 0) { - field_width = -field_width; - flags |= LEFT; - } - } - - /* get the precision */ - precision = -1; - if (*fmt == '.') { - ++fmt; - if (is_digit(*fmt)) - precision = skip_atoi(&fmt); - else if (*fmt == '*') { - /* it's the next argument */ - precision = va_arg(args, int); - } - if (precision < 0) - precision = 0; - } - - /* get the conversion qualifier */ - qualifier = -1; - if (*fmt == 'h' || *fmt == 'l' || *fmt == 
'L') { - qualifier = *fmt; - ++fmt; - } - - switch (*fmt) { - case 'c': - if (!(flags & LEFT)) - while (--field_width > 0) - *str++ = ' '; - *str++ = (unsigned char) va_arg(args, int); - while (--field_width > 0) - *str++ = ' '; - break; - - case 's': - s = va_arg(args, char *); - len = strlen(s); - if (precision < 0) - precision = len; - else if (len > precision) - len = precision; - - if (!(flags & LEFT)) - while (len < field_width--) - *str++ = ' '; - for (i = 0; i < len; ++i) - *str++ = *s++; - while (len < field_width--) - *str++ = ' '; - break; - - case 'o': - str = number(str, va_arg(args, unsigned long), 8, - field_width, precision, flags); - break; - - case 'p': - if (field_width == -1) { - field_width = 8; - flags |= ZEROPAD; - } - str = number(str, - (unsigned long) va_arg(args, void *), 16, - field_width, precision, flags); - break; - - case 'x': - flags |= SMALL; - case 'X': - str = number(str, va_arg(args, unsigned long), 16, - field_width, precision, flags); - break; - - case 'd': - case 'i': - flags |= SIGN; - case 'u': - str = number(str, va_arg(args, unsigned long), 10, - field_width, precision, flags); - break; - - case 'n': - ip = va_arg(args, int *); - *ip = (str - buf); - break; - - default: - if (*fmt != '%') - *str++ = '%'; - if (*fmt) - *str++ = *fmt; - else - --fmt; - break; - } - } - *str = '\0'; - return str-buf; -} diff --git a/integration-tests/src/test/resources/kernel26.txt b/integration-tests/src/test/resources/kernel26.txt deleted file mode 100644 index 54016b6..0000000 --- a/integration-tests/src/test/resources/kernel26.txt +++ /dev/null @@ -1,106086 +0,0 @@ -./acct.c -./audit.c -./audit_tree.c -./auditfilter.c -./auditsc.c -./backtracetest.c -./bounds.c -./capability.c -./cgroup.c -./cgroup_debug.c -./compat.c -./configs.c -./cpu.c -./cpuset.c -./delayacct.c -./dma-coherent.c -./dma.c -./exec_domain.c -./exit.c -./extable.c -./fork.c -./futex.c -./futex_compat.c -./hrtimer.c -./irq/autoprobe.c -./irq/chip.c -./irq/devres.c 
-./irq/handle.c -./irq/manage.c -./irq/migration.c -./irq/proc.c -./irq/resend.c -./irq/spurious.c -./itimer.c -./kallsyms.c -./kexec.c -./kfifo.c -./kgdb.c -./kmod.c -./kprobes.c -./ksysfs.c -./kthread.c -./latencytop.c -./lockdep.c -./lockdep_proc.c -./marker.c -./module.c -./mutex-debug.c -./mutex.c -./notifier.c -./ns_cgroup.c -./nsproxy.c -./panic.c -./params.c -./pid.c -./pid_namespace.c -./pm_qos_params.c -./posix-cpu-timers.c -./posix-timers.c -./power/console.c -./power/disk.c -./power/main.c -./power/poweroff.c -./power/process.c -./power/snapshot.c -./power/swap.c -./power/swsusp.c -./power/user.c -./printk.c -./profile.c -./ptrace.c -./rcuclassic.c -./rcupdate.c -./rcupreempt.c -./rcupreempt_trace.c -./rcutorture.c -./relay.c -./res_counter.c -./resource.c -./rtmutex-debug.c -./rtmutex-tester.c -./rtmutex.c -./rwsem.c -./sched.c -./sched_clock.c -./sched_cpupri.c -./sched_debug.c -./sched_fair.c -./sched_idletask.c -./sched_rt.c -./seccomp.c -./semaphore.c -./signal.c -./smp.c -./softirq.c -./softlockup.c -./spinlock.c -./srcu.c -./stacktrace.c -./stop_machine.c -./sys.c -./sys_ni.c -./sysctl.c -./sysctl_check.c -./taskstats.c -./test_kprobes.c -./time/clockevents.c -./time/clocksource.c -./time/jiffies.c -./time/ntp.c -./time/tick-broadcast.c -./time/tick-common.c -./time/tick-oneshot.c -./time/tick-sched.c -./time/timekeeping.c -./time/timer_list.c -./time/timer_stats.c -./time.c -./timer.c -./trace/ftrace.c -./trace/trace.c -./trace/trace_functions.c -./trace/trace_irqsoff.c -./trace/trace_mmiotrace.c -./trace/trace_sched_switch.c -./trace/trace_sched_wakeup.c -./trace/trace_selftest.c -./trace/trace_selftest_dynamic.c -./trace/trace_sysprof.c -./tsacct.c -./uid16.c -./user.c -./user_namespace.c -./utsname.c -./utsname_sysctl.c -./wait.c -./workqueue.c -/* - * linux/kernel/acct.c - * - * BSD Process Accounting for Linux - * - * Author: Marco van Wieringen - * - * Some code based on ideas and code from: - * Thomas K. 
Dyas - * - * This file implements BSD-style process accounting. Whenever any - * process exits, an accounting record of type "struct acct" is - * written to the file specified with the acct() system call. It is - * up to user-level programs to do useful things with the accounting - * log. The kernel just provides the raw accounting information. - * - * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. - * - * Plugged two leaks. 1) It didn't return acct_file into the free_filps if - * the file happened to be read-only. 2) If the accounting was suspended - * due to the lack of space it happily allowed to reopen it and completely - * lost the old acct_file. 3/10/98, Al Viro. - * - * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). - * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. - * - * Fixed a nasty interaction with with sys_umount(). If the accointing - * was suspeneded we failed to stop it on umount(). Messy. - * Another one: remount to readonly didn't stop accounting. - * Question: what should we do if we have CAP_SYS_ADMIN but not - * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY - * unless we are messing with the root. In that case we are getting a - * real mess with do_remount_sb(). 9/11/98, AV. - * - * Fixed a bunch of races (and pair of leaks). Probably not the best way, - * but this one obviously doesn't introduce deadlocks. Later. BTW, found - * one race (and leak) in BSD implementation. - * OK, that's better. ANOTHER race and leak in BSD variant. There always - * is one more bug... 10/11/98, AV. - * - * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold - * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks - * a struct file opened for write. Fixed. 2/6/2000, AV. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* sector_div */ -#include - -/* - * These constants control the amount of freespace that suspend and - * resume the process accounting system, and the time delay between - * each check. - * Turned into sysctl-controllable parameters. AV, 12/11/98 - */ - -int acct_parm[3] = {4, 2, 30}; -#define RESUME (acct_parm[0]) /* >foo% free space - resume */ -#define SUSPEND (acct_parm[1]) /* needcheck = 1; -} - -/* - * Check the amount of free space and suspend/resume accordingly. - */ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) -{ - struct kstatfs sbuf; - int res; - int act; - sector_t resume; - sector_t suspend; - - spin_lock(&acct_lock); - res = acct->active; - if (!file || !acct->needcheck) - goto out; - spin_unlock(&acct_lock); - - /* May block */ - if (vfs_statfs(file->f_path.dentry, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - sector_div(suspend, 100); - sector_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. - */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act>0; - goto out; - } - - if (acct->active) { - if (act < 0) { - acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); - } - } else { - if (act > 0) { - acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); - } - } - - del_timer(&acct->timer); - acct->needcheck = 0; - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); - res = acct->active; -out: - spin_unlock(&acct_lock); - return res; -} - -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). 
- * - * NOTE: acct_lock MUST be held on entry and exit. - */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) -{ - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - del_timer(&acct->timer); - acct->active = 0; - acct->needcheck = 0; - acct->file = NULL; - acct->ns = NULL; - list_del(&acct->list); - } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = 0; - acct->active = 1; - list_add(&acct->list, &acct_list); - /* It's been deleted if it was used before so this is safe */ - setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); - } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); - spin_lock(&acct_lock); - } -} - -static int acct_on(char *name) -{ - struct file *file; - struct vfsmount *mnt; - int error; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; - - /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { - filp_close(file, NULL); - return -EACCES; - } - - if (!file->f_op->write) { - filp_close(file, NULL); - return -EIO; - } - - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } - } - - error = security_acct(file); - if (error) { - kfree(acct); - filp_close(file, NULL); - return error; - } - - spin_lock(&acct_lock); - if (ns->bacct == NULL) { - ns->bacct = acct; - acct = NULL; - } - - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - - 
mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - - return 0; -} - -/** - * sys_acct - enable/disable process accounting - * @name: file name for accounting records or NULL to shutdown accounting - * - * Returns 0 for success or negative errno values for failure. - * - * sys_acct() is the only system call needed to implement process - * accounting. It takes the name of the file where accounting records - * should be written. If the filename is NULL, accounting will be - * shutdown. - */ -SYSCALL_DEFINE1(acct, const char __user *, name) -{ - int error; - - if (!capable(CAP_SYS_PACCT)) - return -EPERM; - - if (name) { - char *tmp = getname(name); - if (IS_ERR(tmp)) - return (PTR_ERR(tmp)); - error = acct_on(tmp); - putname(tmp); - } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - error = security_acct(NULL); - if (!error) { - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } - } - return error; -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. 
- */ -void acct_auto_close(struct super_block *sb) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -void acct_exit_ns(struct pid_namespace *ns) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); - acct = ns->bacct; - if (acct != NULL) { - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - - kfree(acct); - } - spin_unlock(&acct_lock); -} - -/* - * encode an unsigned long into a comp_t - * - * This routine has been adopted from the encode_comp_t() function in - * the kern_acct.c file of the FreeBSD operating system. The encoding - * is a 13-bit fraction with a 3-bit (base 8) exponent. - */ - -#define MANTSIZE 13 /* 13 bit mantissa. */ -#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ -#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ - -static comp_t encode_comp_t(unsigned long value) -{ - int exp, rnd; - - exp = rnd = 0; - while (value > MAXFRACT) { - rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ - value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ - exp++; - } - - /* - * If we need to round up, do it (and handle overflow correctly). - */ - if (rnd && (++value > MAXFRACT)) { - value >>= EXPSIZE; - exp++; - } - - /* - * Clean it up and polish it off. - */ - exp <<= MANTSIZE; /* Shift the exponent into place */ - exp += value; /* and add on the mantissa. */ - return exp; -} - -#if ACCT_VERSION==1 || ACCT_VERSION==2 -/* - * encode an u64 into a comp2_t (24 bits) - * - * Format: 5 bit base 2 exponent, 20 bits mantissa. - * The leading bit of the mantissa is not stored, but implied for - * non-zero exponents. - * Largest encodable value is 50 bits. - */ - -#define MANTSIZE2 20 /* 20 bit mantissa. */ -#define EXPSIZE2 5 /* 5 bit base 2 exponent. 
*/ -#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 < (MAXFRACT2>>1)); - rnd = 0; - while (value > MAXFRACT2) { - rnd = value & 1; - value >>= 1; - exp++; - } - - /* - * If we need to round up, do it (and handle overflow correctly). - */ - if (rnd && (++value > MAXFRACT2)) { - value >>= 1; - exp++; - } - - if (exp > MAXEXP2) { - /* Overflow. Return largest representable number instead. */ - return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; - } else { - return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); - } -} -#endif - -#if ACCT_VERSION==3 -/* - * encode an u64 into a 32 bit IEEE float - */ -static u32 encode_float(u64 value) -{ - unsigned exp = 190; - unsigned u; - - if (value==0) return 0; - while ((s64)value > 0){ - value <<= 1; - exp--; - } - u = (u32)(value >> 40) & 0x7fffffu; - return u | (exp << 23); -} -#endif - -/* - * Write an accounting entry for an exiting process - * - * The acct_process() call is the workhorse of the process - * accounting system. The struct acct is built here and then written - * into the accounting file. This function should only be called from - * do_exit() or when switching to a different output file. - */ - -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) -{ - struct pacct_struct *pacct = ¤t->signal->pacct; - acct_t ac; - mm_segment_t fs; - unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; - struct tty_struct *tty; - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - return; - - /* - * Fill the accounting struct with the needed info as recorded - * by the different kernel functions. 
- */ - memset((caddr_t)&ac, 0, sizeof(acct_t)); - - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); - - /* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; - /* convert nsec -> AHZ */ - elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); -#else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 - { - /* new enlarged etime field */ - comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; - } -#endif - do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; - /* we really need to bite the bullet and change layout */ - ac.ac_uid = current->uid; - ac.ac_gid = current->gid; -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 - /* backward-compatible 16 bit fields */ - ac.ac_uid16 = current->uid; - ac.ac_gid16 = current->gid; -#endif -#if ACCT_VERSION==3 - ac.ac_pid = task_tgid_nr_ns(current, ns); - rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); - rcu_read_unlock(); -#endif - - spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; - ac.ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(¤t->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); - - /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); -} - -/** - * acct_init_pacct - initialize a new pacct_struct - * @pacct: per-process accounting info struct to initialize - */ -void acct_init_pacct(struct pacct_struct *pacct) -{ - memset(pacct, 0, sizeof(struct pacct_struct)); - pacct->ac_utime = pacct->ac_stime = cputime_zero; -} - -/** - * acct_collect - collect accounting information into pacct_struct - * @exitcode: task exit code - * @group_dead: not 0, if this thread is the last one in the process. 
- */ -void acct_collect(long exitcode, int group_dead) -{ - struct pacct_struct *pacct = ¤t->signal->pacct; - unsigned long vsize = 0; - - if (group_dead && current->mm) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(¤t->mm->mmap_sem); - } - - spin_lock_irq(¤t->sighand->siglock); - if (group_dead) - pacct->ac_mem = vsize / 1024; - if (thread_group_leader(current)) { - pacct->ac_exitcode = exitcode; - if (current->flags & PF_FORKNOEXEC) - pacct->ac_flag |= AFORK; - } - if (current->flags & PF_SUPERPRIV) - pacct->ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - pacct->ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - pacct->ac_flag |= AXSIG; - pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); - pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); - pacct->ac_minflt += current->min_flt; - pacct->ac_majflt += current->maj_flt; - spin_unlock_irq(¤t->sighand->siglock); -} - -static void acct_process_in_ns(struct pid_namespace *ns) -{ - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - return; - } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); -} - -/** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. - * - * handles process accounting for an exiting task - */ -void acct_process(void) -{ - struct pid_namespace *ns; - - /* - * This loop is safe lockless, since current is still - * alive and holds its namespace, which in turn holds - * its parent. 
- */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); -} -/* audit.c -- Auditing support - * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. - * System-call specific features have moved to auditsc.c - * - * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Written by Rickard E. (Rik) Faith - * - * Goals: 1) Integrate fully with Security Modules. - * 2) Minimal run-time overhead: - * a) Minimal when syscall auditing is disabled (audit_enable=0). - * b) Small when syscall auditing is enabled and no audit record - * is generated (defer as much work as possible to record - * generation time): - * i) context is allocated, - * ii) names from getname are stored without a copy, and - * iii) inode information stored from path_lookup. - * 3) Ability to disable syscall auditing at boot time (audit=0). - * 4) Usable by other parts of the kernel (if audit_log* is called, - * then a syscall record will be generated automatically for the - * current syscall). - * 5) Netlink interface to user-space. - * 6) Support low-overhead kernel-based filtering to minimize the - * information that must be passed to user-space. 
- * - * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "audit.h" - -/* No auditing will take place until audit_initialized != 0. - * (Initialization happens after skb_init is called.) */ -static int audit_initialized; - -#define AUDIT_OFF 0 -#define AUDIT_ON 1 -#define AUDIT_LOCKED 2 -int audit_enabled; -int audit_ever_enabled; - -/* Default state when kernel boots without any parameters. */ -static int audit_default; - -/* If auditing cannot proceed, audit_failure selects what happens. */ -static int audit_failure = AUDIT_FAIL_PRINTK; - -/* - * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_pid contains - * the pid to use to send netlink messages to that process. - */ -int audit_pid; -static int audit_nlk_pid; - -/* If audit_rate_limit is non-zero, limit the rate of sending audit records - * to that number per second. This prevents DoS attacks, but results in - * audit records being dropped. */ -static int audit_rate_limit; - -/* Number of outstanding audit_buffers allowed. */ -static int audit_backlog_limit = 64; -static int audit_backlog_wait_time = 60 * HZ; -static int audit_backlog_wait_overflow = 0; - -/* The identity of the user shutting down the audit system. */ -uid_t audit_sig_uid = -1; -pid_t audit_sig_pid = -1; -u32 audit_sig_sid = 0; - -/* Records can be lost in several ways: - 0) [suppressed in audit_alloc] - 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] - 2) out of memory in audit_log_move [alloc_skb] - 3) suppressed due to audit_rate_limit - 4) suppressed due to audit_backlog_limit -*/ -static atomic_t audit_lost = ATOMIC_INIT(0); - -/* The netlink socket. */ -static struct sock *audit_sock; - -/* Inotify handle. 
*/ -struct inotify_handle *audit_ih; - -/* Hash for inode-based rules */ -struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; - -/* The audit_freelist is a list of pre-allocated audit buffers (if more - * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of - * being placed on the freelist). */ -static DEFINE_SPINLOCK(audit_freelist_lock); -static int audit_freelist_count; -static LIST_HEAD(audit_freelist); - -static struct sk_buff_head audit_skb_queue; -/* queue of skbs to send to auditd when/if it comes back */ -static struct sk_buff_head audit_skb_hold_queue; -static struct task_struct *kauditd_task; -static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); -static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); - -/* Serialize requests from userspace. */ -static DEFINE_MUTEX(audit_cmd_mutex); - -/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting - * audit records. Since printk uses a 1024 byte buffer, this buffer - * should be at least that large. */ -#define AUDIT_BUFSIZ 1024 - -/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the - * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ -#define AUDIT_MAXFREE (2*NR_CPUS) - -/* The audit_buffer is used when formatting an audit record. The caller - * locks briefly to get the record off the freelist or to allocate the - * buffer, and locks briefly to send the buffer to the netlink layer or - * to place it on a transmit queue. Multiple audit_buffers can be in - * use simultaneously. 
*/ -struct audit_buffer { - struct list_head list; - struct sk_buff *skb; /* formatted skb ready to send */ - struct audit_context *ctx; /* NULL or associated context */ - gfp_t gfp_mask; -}; - -struct audit_reply { - int pid; - struct sk_buff *skb; -}; - -static void audit_set_pid(struct audit_buffer *ab, pid_t pid) -{ - if (ab) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_pid = pid; - } -} - -void audit_panic(const char *message) -{ - switch (audit_failure) - { - case AUDIT_FAIL_SILENT: - break; - case AUDIT_FAIL_PRINTK: - if (printk_ratelimit()) - printk(KERN_ERR "audit: %s\n", message); - break; - case AUDIT_FAIL_PANIC: - /* test audit_pid since printk is always losey, why bother? */ - if (audit_pid) - panic("audit: %s\n", message); - break; - } -} - -static inline int audit_rate_check(void) -{ - static unsigned long last_check = 0; - static int messages = 0; - static DEFINE_SPINLOCK(lock); - unsigned long flags; - unsigned long now; - unsigned long elapsed; - int retval = 0; - - if (!audit_rate_limit) return 1; - - spin_lock_irqsave(&lock, flags); - if (++messages < audit_rate_limit) { - retval = 1; - } else { - now = jiffies; - elapsed = now - last_check; - if (elapsed > HZ) { - last_check = now; - messages = 0; - retval = 1; - } - } - spin_unlock_irqrestore(&lock, flags); - - return retval; -} - -/** - * audit_log_lost - conditionally log lost audit message event - * @message: the message stating reason for lost audit message - * - * Emit at least 1 message per second, even if audit_rate_check is - * throttling. - * Always increment the lost messages counter. 
-*/ -void audit_log_lost(const char *message) -{ - static unsigned long last_msg = 0; - static DEFINE_SPINLOCK(lock); - unsigned long flags; - unsigned long now; - int print; - - atomic_inc(&audit_lost); - - print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); - - if (!print) { - spin_lock_irqsave(&lock, flags); - now = jiffies; - if (now - last_msg > HZ) { - print = 1; - last_msg = now; - } - spin_unlock_irqrestore(&lock, flags); - } - - if (print) { - if (printk_ratelimit()) - printk(KERN_WARNING - "audit: audit_lost=%d audit_rate_limit=%d " - "audit_backlog_limit=%d\n", - atomic_read(&audit_lost), - audit_rate_limit, - audit_backlog_limit); - audit_panic(message); - } -} - -static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sessionid, u32 sid, - int allow_changes) -{ - struct audit_buffer *ab; - int rc = 0; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, loginuid, sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) { - audit_log_format(ab, " sid=%u", sid); - allow_changes = 0; /* Something weird, deny request */ - } else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " res=%d", allow_changes); - audit_log_end(ab); - return rc; -} - -static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sessionid, - u32 sid) -{ - int allow_changes, rc = 0, old = *to_change; - - /* check if we are locked */ - if (audit_enabled == AUDIT_LOCKED) - allow_changes = 0; - else - allow_changes = 1; - - if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, loginuid, - sessionid, sid, allow_changes); - if (rc) - allow_changes = 0; - } - - /* If we are allowed, make the change */ - if (allow_changes == 1) - *to_change = new; - 
/* Not allowed, update reason */ - else if (rc == 0) - rc = -EPERM; - return rc; -} - -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, - u32 sid) -{ - return audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sessionid, sid); -} - -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, - u32 sid) -{ - return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sessionid, sid); -} - -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) -{ - int rc; - if (state < AUDIT_OFF || state > AUDIT_LOCKED) - return -EINVAL; - - rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sessionid, sid); - - if (!rc) - audit_ever_enabled |= !!state; - - return rc; -} - -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) -{ - if (state != AUDIT_FAIL_SILENT - && state != AUDIT_FAIL_PRINTK - && state != AUDIT_FAIL_PANIC) - return -EINVAL; - - return audit_do_config_change("audit_failure", &audit_failure, state, - loginuid, sessionid, sid); -} - -/* - * Queue skbs to be sent to auditd when/if it comes back. These skbs should - * already have been sent via prink/syslog and so if these messages are dropped - * it is not a huge concern since we already passed the audit_log_lost() - * notification and stuff. This is just nice to get audit messages during - * boot before auditd is running or messages generated while auditd is stopped. - * This only holds messages is audit_default is set, aka booting with audit=1 - * or building your kernel that way. 
- */ -static void audit_hold_skb(struct sk_buff *skb) -{ - if (audit_default && - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) - skb_queue_tail(&audit_skb_hold_queue, skb); - else - kfree_skb(skb); -} - -static void kauditd_send_skb(struct sk_buff *skb) -{ - int err; - /* take a reference in case we can't send it and we want to hold it */ - skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); - if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ - printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd dissapeared\n"); - audit_pid = 0; - /* we might get lucky and get this in the next auditd */ - audit_hold_skb(skb); - } else - /* drop the extra reference if sent ok */ - kfree_skb(skb); -} - -static int kauditd_thread(void *dummy) -{ - struct sk_buff *skb; - - set_freezable(); - while (!kthread_should_stop()) { - /* - * if auditd just started drain the queue of messages already - * sent to syslog/printk. remember loss here is ok. we already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. - * - * if you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. 
- */ - if (audit_default && audit_pid) { - skb = skb_dequeue(&audit_skb_hold_queue); - if (unlikely(skb)) { - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } - } - } - - skb = skb_dequeue(&audit_skb_queue); - wake_up(&audit_backlog_wait); - if (skb) { - if (audit_pid) - kauditd_send_skb(skb); - else { - if (printk_ratelimit()) - printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); - else - audit_log_lost("printk limit exceeded\n"); - - audit_hold_skb(skb); - } - } else { - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); - } - } - return 0; -} - -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) -{ - struct task_struct *tsk; - int err; - - read_lock(&tasklist_lock); - tsk = find_task_by_vpid(pid); - err = -ESRCH; - if (!tsk) - goto out; - err = 0; - - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->audit_tty) - err = -EPERM; - spin_unlock_irq(&tsk->sighand->siglock); - if (err) - goto out; - - tty_audit_push_task(tsk, loginuid, sessionid); -out: - read_unlock(&tasklist_lock); - return err; -} - -int audit_send_list(void *_dest) -{ - struct audit_netlink_list *dest = _dest; - int pid = dest->pid; - struct sk_buff *skb; - - /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); - - while ((skb = __skb_dequeue(&dest->q)) != NULL) - netlink_unicast(audit_sock, skb, pid, 0); - - kfree(dest); - - return 0; -} - -#ifdef CONFIG_AUDIT_TREE -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - audit_prune_trees(); - mutex_unlock(&audit_cmd_mutex); - return 0; -} - -void audit_schedule_prune(void) -{ - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); -} -#endif - 
-struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, - int multi, void *payload, int size) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); - void *data; - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? NLMSG_DONE : type; - - skb = alloc_skb(len, GFP_KERNEL); - if (!skb) - return NULL; - - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); - memcpy(data, payload, size); - return skb; - -nlmsg_failure: /* Used by NLMSG_PUT */ - if (skb) - kfree_skb(skb); - return NULL; -} - -static int audit_send_reply_thread(void *arg) -{ - struct audit_reply *reply = (struct audit_reply *)arg; - - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); - - /* Ignore failure. It'll only happen if the sender goes away, - because our timeout is set to infinite. */ - netlink_unicast(audit_sock, reply->skb, reply->pid, 0); - kfree(reply); - return 0; -} -/** - * audit_send_reply - send an audit reply message via netlink - * @pid: process id to send reply to - * @seq: sequence number - * @type: audit message type - * @done: done (last) flag - * @multi: multi-part message flag - * @payload: payload data - * @size: payload size - * - * Allocates an skb, builds the netlink message, and sends it to the pid. - * No failure notifications. - */ -void audit_send_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size) -{ - struct sk_buff *skb; - struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); - - if (!reply) - return; - - skb = audit_make_reply(pid, seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->pid = pid; - reply->skb = skb; - - tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); -} - -/* - * Check for appropriate CAP_AUDIT_ capabilities on incoming audit - * control messages. 
- */ -static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) -{ - int err = 0; - - switch (msg_type) { - case AUDIT_GET: - case AUDIT_LIST: - case AUDIT_LIST_RULES: - case AUDIT_SET: - case AUDIT_ADD: - case AUDIT_ADD_RULE: - case AUDIT_DEL: - case AUDIT_DEL_RULE: - case AUDIT_SIGNAL_INFO: - case AUDIT_TTY_GET: - case AUDIT_TTY_SET: - case AUDIT_TRIM: - case AUDIT_MAKE_EQUIV: - if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) - err = -EPERM; - break; - case AUDIT_USER: - case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) - err = -EPERM; - break; - default: /* bad msg */ - err = -EINVAL; - } - - return err; -} - -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 ses, - u32 sid) -{ - int rc = 0; - char *ctx = NULL; - u32 len; - - if (!audit_enabled) { - *ab = NULL; - return rc; - } - - *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", - pid, uid, auid, ses); - if (sid) { - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) - audit_log_format(*ab, " ssid=%u", sid); - else { - audit_log_format(*ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - return rc; -} - -static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - u32 uid, pid, seq, sid; - void *data; - struct audit_status *status_get, status_set; - int err; - struct audit_buffer *ab; - u16 msg_type = nlh->nlmsg_type; - uid_t loginuid; /* loginuid of sender */ - u32 sessionid; - struct audit_sig_info *sig_data; - char *ctx = NULL; - u32 len; - - err = audit_netlink_ok(skb, msg_type); - if (err) - return err; - - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = 
PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; - } - - pid = NETLINK_CREDS(skb)->pid; - uid = NETLINK_CREDS(skb)->uid; - loginuid = NETLINK_CB(skb).loginuid; - sessionid = NETLINK_CB(skb).sessionid; - sid = NETLINK_CB(skb).sid; - seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); - - switch (msg_type) { - case AUDIT_GET: - status_set.enabled = audit_enabled; - status_set.failure = audit_failure; - status_set.pid = audit_pid; - status_set.rate_limit = audit_rate_limit; - status_set.backlog_limit = audit_backlog_limit; - status_set.lost = atomic_read(&audit_lost); - status_set.backlog = skb_queue_len(&audit_skb_queue); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, - &status_set, sizeof(status_set)); - break; - case AUDIT_SET: - if (nlh->nlmsg_len < sizeof(struct audit_status)) - return -EINVAL; - status_get = (struct audit_status *)data; - if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, - loginuid, sessionid, sid); - if (err < 0) - return err; - } - if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, - loginuid, sessionid, sid); - if (err < 0) - return err; - } - if (status_get->mask & AUDIT_STATUS_PID) { - int new_pid = status_get->pid; - - if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, - audit_pid, loginuid, - sessionid, sid, 1); - - audit_pid = new_pid; - audit_nlk_pid = NETLINK_CB(skb).pid; - } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { - err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sessionid, sid); - if (err < 0) - return err; - } - if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sessionid, sid); - break; - case AUDIT_USER: - case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2 ... 
AUDIT_LAST_USER_MSG2: - if (!audit_enabled && msg_type != AUDIT_USER_AVC) - return 0; - - err = audit_filter_user(&NETLINK_CB(skb)); - if (err == 1) { - err = 0; - if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid, - sessionid); - if (err) - break; - } - audit_log_common_recv_msg(&ab, msg_type, pid, uid, - loginuid, sessionid, sid); - - if (msg_type != AUDIT_USER_TTY) - audit_log_format(ab, " msg='%.1024s'", - (char *)data); - else { - int size; - - audit_log_format(ab, " msg="); - size = nlmsg_len(nlh); - audit_log_n_untrustedstring(ab, data, size); - } - audit_set_pid(ab, pid); - audit_log_end(ab); - } - break; - case AUDIT_ADD: - case AUDIT_DEL: - if (nlmsg_len(nlh) < sizeof(struct audit_rule)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); - break; - case AUDIT_ADD_RULE: - case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); - break; - case AUDIT_TRIM: - audit_trim_trees(); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " op=trim res=1"); - audit_log_end(ab); - break; - case AUDIT_MAKE_EQUIV: { - void *bufp = data; - u32 sizes[2]; - size_t msglen 
= nlmsg_len(nlh); - char *old, *new; - - err = -EINVAL; - if (msglen < 2 * sizeof(u32)) - break; - memcpy(sizes, bufp, 2 * sizeof(u32)); - bufp += 2 * sizeof(u32); - msglen -= 2 * sizeof(u32); - old = audit_unpack_string(&bufp, &msglen, sizes[0]); - if (IS_ERR(old)) { - err = PTR_ERR(old); - break; - } - new = audit_unpack_string(&bufp, &msglen, sizes[1]); - if (IS_ERR(new)) { - err = PTR_ERR(new); - kfree(old); - break; - } - /* OK, here comes... */ - err = audit_tag_tree(old, new); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " op=make_equiv old="); - audit_log_untrustedstring(ab, old); - audit_log_format(ab, " new="); - audit_log_untrustedstring(ab, new); - audit_log_format(ab, " res=%d", !err); - audit_log_end(ab); - kfree(old); - kfree(new); - break; - } - case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; - sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); - if (!sig_data) { - security_release_secctx(ctx, len); - return -ENOMEM; - } - sig_data->uid = audit_sig_uid; - sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); - kfree(sig_data); - break; - case AUDIT_TTY_GET: { - struct audit_tty_status s; - struct task_struct *tsk; - - read_lock(&tasklist_lock); - tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, - &s, sizeof(s)); - break; - } - case AUDIT_TTY_SET: { - struct audit_tty_status *s; - struct task_struct *tsk; - - if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) - return -EINVAL; - s = data; - if (s->enabled != 0 
&& s->enabled != 1) - return -EINVAL; - read_lock(&tasklist_lock); - tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); - tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - break; - } - default: - err = -EINVAL; - break; - } - - return err < 0 ? err : 0; -} - -/* - * Get message from skb (based on rtnetlink_rcv_skb). Each message is - * processed by audit_receive_msg. Malformed skbs with wrong length are - * discarded silently. - */ -static void audit_receive_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - u32 rlen; - - while (skb->len >= NLMSG_SPACE(0)) { - nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if ((err = audit_receive_msg(skb, nlh))) { - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } -} - -/* Receive messages from netlink socket. */ -static void audit_receive(struct sk_buff *skb) -{ - mutex_lock(&audit_cmd_mutex); - audit_receive_skb(skb); - mutex_unlock(&audit_cmd_mutex); -} - -#ifdef CONFIG_AUDITSYSCALL -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, -}; -#endif - -/* Initialize audit support at boot time. */ -static int __init audit_init(void) -{ - int i; - - printk(KERN_INFO "audit: initializing netlink socket (%s)\n", - audit_default ? 
"enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); - if (!audit_sock) - audit_panic("cannot initialize netlink socket"); - else - audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - - skb_queue_head_init(&audit_skb_queue); - skb_queue_head_init(&audit_skb_hold_queue); - audit_initialized = 1; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; - - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); - -#ifdef CONFIG_AUDITSYSCALL - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); -#endif - - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) - INIT_LIST_HEAD(&audit_inode_hash[i]); - - return 0; -} -__initcall(audit_init); - -/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ -static int __init audit_enable(char *str) -{ - audit_default = !!simple_strtol(str, NULL, 0); - printk(KERN_INFO "audit: %s%s\n", - audit_default ? "enabled" : "disabled", - audit_initialized ? 
"" : " (after initialization)"); - if (audit_initialized) { - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; - } - return 1; -} - -__setup("audit=", audit_enable); - -static void audit_buffer_free(struct audit_buffer *ab) -{ - unsigned long flags; - - if (!ab) - return; - - if (ab->skb) - kfree_skb(ab->skb); - - spin_lock_irqsave(&audit_freelist_lock, flags); - if (audit_freelist_count > AUDIT_MAXFREE) - kfree(ab); - else { - audit_freelist_count++; - list_add(&ab->list, &audit_freelist); - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); -} - -static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - gfp_t gfp_mask, int type) -{ - unsigned long flags; - struct audit_buffer *ab = NULL; - struct nlmsghdr *nlh; - - spin_lock_irqsave(&audit_freelist_lock, flags); - if (!list_empty(&audit_freelist)) { - ab = list_entry(audit_freelist.next, - struct audit_buffer, list); - list_del(&ab->list); - --audit_freelist_count; - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); - - if (!ab) { - ab = kmalloc(sizeof(*ab), gfp_mask); - if (!ab) - goto err; - } - - ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); - if (!ab->skb) - goto err; - - ab->ctx = ctx; - ab->gfp_mask = gfp_mask; - nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); - nlh->nlmsg_type = type; - nlh->nlmsg_flags = 0; - nlh->nlmsg_pid = 0; - nlh->nlmsg_seq = 0; - return ab; -err: - audit_buffer_free(ab); - return NULL; -} - -/** - * audit_serial - compute a serial number for the audit record - * - * Compute a serial number for the audit record. Audit records are - * written to user-space as soon as they are generated, so a complete - * audit record may be written in several pieces. The timestamp of the - * record and this serial number are used by the user-space tools to - * determine which pieces belong to the same audit record. The - * (timestamp,serial) tuple is unique for each syscall and is live from - * syscall entry to syscall exit. 
- * - * NOTE: Another possibility is to store the formatted records off the - * audit context (for those records that have a context), and emit them - * all at syscall exit. However, this could delay the reporting of - * significant errors until syscall exit (or never, if the system - * halts). - */ -unsigned int audit_serial(void) -{ - static DEFINE_SPINLOCK(serial_lock); - static unsigned int serial = 0; - - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(&serial_lock, flags); - do { - ret = ++serial; - } while (unlikely(!ret)); - spin_unlock_irqrestore(&serial_lock, flags); - - return ret; -} - -static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) -{ - if (ctx) - auditsc_get_stamp(ctx, t, serial); - else { - *t = CURRENT_TIME; - *serial = audit_serial(); - } -} - -/* Obtain an audit buffer. This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the tsk is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. If there is no associated task, tsk - * should be NULL. */ - -/** - * audit_log_start - obtain an audit buffer - * @ctx: audit_context (may be NULL) - * @gfp_mask: type of allocation - * @type: audit message type - * - * Returns audit_buffer pointer on success or NULL on error. - * - * Obtain an audit buffer. This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the task (ctx) is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. If there is no associated task, then - * task context (ctx) should be NULL. 
- */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, - int type) -{ - struct audit_buffer *ab = NULL; - struct timespec t; - unsigned int uninitialized_var(serial); - int reserve; - unsigned long timeout_start = jiffies; - - if (!audit_initialized) - return NULL; - - if (unlikely(audit_filter_type(type))) - return NULL; - - if (gfp_mask & __GFP_WAIT) - reserve = 0; - else - reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ - - while (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time - && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { - - /* Wait for auditd to drain the queue a little */ - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&audit_backlog_wait, &wait); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) - schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); - continue; - } - if (audit_rate_check() && printk_ratelimit()) - printk(KERN_WARNING - "audit: audit_backlog=%d > " - "audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), - audit_backlog_limit); - audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; - wake_up(&audit_backlog_wait); - return NULL; - } - - ab = audit_buffer_alloc(ctx, gfp_mask, type); - if (!ab) { - audit_log_lost("out of memory in audit_log_start"); - return NULL; - } - - audit_get_stamp(ab->ctx, &t, &serial); - - audit_log_format(ab, "audit(%lu.%03lu:%u): ", - t.tv_sec, t.tv_nsec/1000000, serial); - return ab; -} - -/** - * audit_expand - expand skb in the audit buffer - * @ab: audit_buffer - * @extra: space to add at tail of the skb - * - * Returns 0 (no space) on failed expansion, or available space if - 
* successful. - */ -static inline int audit_expand(struct audit_buffer *ab, int extra) -{ - struct sk_buff *skb = ab->skb; - int oldtail = skb_tailroom(skb); - int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); - int newtail = skb_tailroom(skb); - - if (ret < 0) { - audit_log_lost("out of memory in audit_expand"); - return 0; - } - - skb->truesize += newtail - oldtail; - return newtail; -} - -/* - * Format an audit message into the audit buffer. If there isn't enough - * room in the audit buffer, more room will be allocated and vsnprint - * will be called a second time. Currently, we assume that a printk - * can't format message larger than 1024 bytes, so we don't either. - */ -static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, - va_list args) -{ - int len, avail; - struct sk_buff *skb; - va_list args2; - - if (!ab) - return; - - BUG_ON(!ab->skb); - skb = ab->skb; - avail = skb_tailroom(skb); - if (avail == 0) { - avail = audit_expand(ab, AUDIT_BUFSIZ); - if (!avail) - goto out; - } - va_copy(args2, args); - len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); - if (len >= avail) { - /* The printk buffer is 1024 bytes long, so if we get - * here and AUDIT_BUFSIZ is at least 1024, then we can - * log everything that printk could have logged. */ - avail = audit_expand(ab, - max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); - if (!avail) - goto out; - len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); - } - va_end(args2); - if (len > 0) - skb_put(skb, len); -out: - return; -} - -/** - * audit_log_format - format a message into the audit buffer. - * @ab: audit_buffer - * @fmt: format string - * @...: optional parameters matching @fmt string - * - * All the work is done in audit_log_vformat. - */ -void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 
-{ - va_list args; - - if (!ab) - return; - va_start(args, fmt); - audit_log_vformat(ab, fmt, args); - va_end(args); -} - -/** - * audit_log_hex - convert a buffer to hex and append it to the audit skb - * @ab: the audit_buffer - * @buf: buffer to convert to hex - * @len: length of @buf to be converted - * - * No return value; failure to expand is silently ignored. - * - * This function will take the passed buf and convert it into a string of - * ascii hex digits. The new string is placed onto the skb. - */ -void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, - size_t len) -{ - int i, avail, new_len; - unsigned char *ptr; - struct sk_buff *skb; - static const unsigned char *hex = "0123456789ABCDEF"; - - if (!ab) - return; - - BUG_ON(!ab->skb); - skb = ab->skb; - avail = skb_tailroom(skb); - new_len = len<<1; - if (new_len >= avail) { - /* Round the buffer request up to the next multiple */ - new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); - avail = audit_expand(ab, new_len); - if (!avail) - return; - } - - ptr = skb_tail_pointer(skb); - for (i=0; i>4]; /* Upper nibble */ - *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ - } - *ptr = 0; - skb_put(skb, len << 1); /* new string is twice the old string */ -} - -/* - * Format a string of no more than slen characters into the audit buffer, - * enclosed in quote marks. 
- */ -void audit_log_n_string(struct audit_buffer *ab, const char *string, - size_t slen) -{ - int avail, new_len; - unsigned char *ptr; - struct sk_buff *skb; - - if (!ab) - return; - - BUG_ON(!ab->skb); - skb = ab->skb; - avail = skb_tailroom(skb); - new_len = slen + 3; /* enclosing quotes + null terminator */ - if (new_len > avail) { - avail = audit_expand(ab, new_len); - if (!avail) - return; - } - ptr = skb_tail_pointer(skb); - *ptr++ = '"'; - memcpy(ptr, string, slen); - ptr += slen; - *ptr++ = '"'; - *ptr = 0; - skb_put(skb, slen + 2); /* don't include null terminator */ -} - -/** - * audit_string_contains_control - does a string need to be logged in hex - * @string: string to be checked - * @len: max length of the string to check - */ -int audit_string_contains_control(const char *string, size_t len) -{ - const unsigned char *p; - for (p = string; p < (const unsigned char *)string + len && *p; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7e) - return 1; - } - return 0; -} - -/** - * audit_log_n_untrustedstring - log a string that may contain random characters - * @ab: audit_buffer - * @len: length of string (not including trailing null) - * @string: string to be logged - * - * This code will escape a string that is passed to it if the string - * contains a control character, unprintable character, double quote mark, - * or a space. Unescaped strings will start and end with a double quote mark. - * Strings that are escaped are printed in hex (2 digits per char). - * - * The caller specifies the number of characters in the string to log, which may - * or may not be the entire string. 
- */ -void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, - size_t len) -{ - if (audit_string_contains_control(string, len)) - audit_log_n_hex(ab, string, len); - else - audit_log_n_string(ab, string, len); -} - -/** - * audit_log_untrustedstring - log a string that may contain random characters - * @ab: audit_buffer - * @string: string to be logged - * - * Same as audit_log_n_untrustedstring(), except that strlen is used to - * determine string length. - */ -void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) -{ - audit_log_n_untrustedstring(ab, string, strlen(string)); -} - -/* This is a helper-function to print the escaped d_path */ -void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) -{ - char *p, *pathname; - - if (prefix) - audit_log_format(ab, " %s", prefix); - - /* We will allow 11 spaces for ' (deleted)' to be appended */ - pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); - if (!pathname) { - audit_log_format(ab, ""); - return; - } - p = d_path(path, pathname, PATH_MAX+11); - if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ - /* FIXME: can we save some information here? */ - audit_log_format(ab, ""); - } else - audit_log_untrustedstring(ab, p); - kfree(pathname); -} - -/** - * audit_log_end - end one audit record - * @ab: the audit_buffer - * - * The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is placed on a queue and a tasklet is scheduled to - * remove them from the queue outside the irq context. May be called in - * any context. 
- */ -void audit_log_end(struct audit_buffer *ab) -{ - if (!ab) - return; - if (!audit_rate_check()) { - audit_log_lost("rate limit exceeded"); - } else { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); - - if (audit_pid) { - skb_queue_tail(&audit_skb_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); - } else { - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) { - printk(KERN_NOTICE "type=%d %s\n", - nlh->nlmsg_type, - ab->skb->data + NLMSG_SPACE(0)); - } else - audit_log_lost("printk limit exceeded\n"); - } - audit_hold_skb(ab->skb); - } - ab->skb = NULL; - } - audit_buffer_free(ab); -} - -/** - * audit_log - Log an audit record - * @ctx: audit context - * @gfp_mask: type of allocation - * @type: audit message type - * @fmt: format string to use - * @...: variable parameters matching the format string - * - * This is a convenience function that calls audit_log_start, - * audit_log_vformat, and audit_log_end. It may be called - * in any context. - */ -void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, - const char *fmt, ...) 
-{ - struct audit_buffer *ab; - va_list args; - - ab = audit_log_start(ctx, gfp_mask, type); - if (ab) { - va_start(args, fmt); - audit_log_vformat(ab, fmt, args); - va_end(args); - audit_log_end(ab); - } -} - -EXPORT_SYMBOL(audit_log_start); -EXPORT_SYMBOL(audit_log_end); -EXPORT_SYMBOL(audit_log_format); -EXPORT_SYMBOL(audit_log); -#include "audit.h" -#include -#include -#include - -struct audit_tree; -struct audit_chunk; - -struct audit_tree { - atomic_t count; - int goner; - struct audit_chunk *root; - struct list_head chunks; - struct list_head rules; - struct list_head list; - struct list_head same_root; - struct rcu_head head; - char pathname[]; -}; - -struct audit_chunk { - struct list_head hash; - struct inotify_watch watch; - struct list_head trees; /* with root here */ - int dead; - int count; - atomic_long_t refs; - struct rcu_head head; - struct node { - struct list_head list; - struct audit_tree *owner; - unsigned index; /* index; upper bit indicates 'will prune' */ - } owners[]; -}; - -static LIST_HEAD(tree_list); -static LIST_HEAD(prune_list); - -/* - * One struct chunk is attached to each inode of interest. - * We replace struct chunk on tagging/untagging. - * Rules have pointer to struct audit_tree. - * Rules have struct list_head rlist forming a list of rules over - * the same tree. - * References to struct chunk are collected at audit_inode{,_child}() - * time and used in AUDIT_TREE rule matching. - * These references are dropped at the same time we are calling - * audit_free_names(), etc. - * - * Cyclic lists galore: - * tree.chunks anchors chunk.owners[].list hash_lock - * tree.rules anchors rule.rlist audit_filter_mutex - * chunk.trees anchors tree.same_root hash_lock - * chunk.hash is a hash with middle bits of watch.inode as - * a hash function. RCU, hash_lock - * - * tree is refcounted; one reference for "some rules on rules_list refer to - * it", one for each chunk with pointer to it. 
- * - * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount - * of watch contributes 1 to .refs). - * - * node.index allows to get from node.list to containing chunk. - * MSB of that sucker is stolen to mark taggings that we might have to - * revert - several operations have very unpleasant cleanup logics and - * that makes a difference. Some. - */ - -static struct inotify_handle *rtree_ih; - -static struct audit_tree *alloc_tree(const char *s) -{ - struct audit_tree *tree; - - tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); - if (tree) { - atomic_set(&tree->count, 1); - tree->goner = 0; - INIT_LIST_HEAD(&tree->chunks); - INIT_LIST_HEAD(&tree->rules); - INIT_LIST_HEAD(&tree->list); - INIT_LIST_HEAD(&tree->same_root); - tree->root = NULL; - strcpy(tree->pathname, s); - } - return tree; -} - -static inline void get_tree(struct audit_tree *tree) -{ - atomic_inc(&tree->count); -} - -static void __put_tree(struct rcu_head *rcu) -{ - struct audit_tree *tree = container_of(rcu, struct audit_tree, head); - kfree(tree); -} - -static inline void put_tree(struct audit_tree *tree) -{ - if (atomic_dec_and_test(&tree->count)) - call_rcu(&tree->head, __put_tree); -} - -/* to avoid bringing the entire thing in audit.h */ -const char *audit_tree_path(struct audit_tree *tree) -{ - return tree->pathname; -} - -static struct audit_chunk *alloc_chunk(int count) -{ - struct audit_chunk *chunk; - size_t size; - int i; - - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); - if (!chunk) - return NULL; - - INIT_LIST_HEAD(&chunk->hash); - INIT_LIST_HEAD(&chunk->trees); - chunk->count = count; - atomic_long_set(&chunk->refs, 1); - for (i = 0; i < count; i++) { - INIT_LIST_HEAD(&chunk->owners[i].list); - chunk->owners[i].index = i; - } - inotify_init_watch(&chunk->watch); - return chunk; -} - -static void free_chunk(struct audit_chunk *chunk) -{ - int i; - - for (i = 0; i < chunk->count; 
i++) { - if (chunk->owners[i].owner) - put_tree(chunk->owners[i].owner); - } - kfree(chunk); -} - -void audit_put_chunk(struct audit_chunk *chunk) -{ - if (atomic_long_dec_and_test(&chunk->refs)) - free_chunk(chunk); -} - -static void __put_chunk(struct rcu_head *rcu) -{ - struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); - audit_put_chunk(chunk); -} - -enum {HASH_SIZE = 128}; -static struct list_head chunk_hash_heads[HASH_SIZE]; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); - -static inline struct list_head *chunk_hash(const struct inode *inode) -{ - unsigned long n = (unsigned long)inode / L1_CACHE_BYTES; - return chunk_hash_heads + n % HASH_SIZE; -} - -/* hash_lock is held by caller */ -static void insert_hash(struct audit_chunk *chunk) -{ - struct list_head *list = chunk_hash(chunk->watch.inode); - list_add_rcu(&chunk->hash, list); -} - -/* called under rcu_read_lock */ -struct audit_chunk *audit_tree_lookup(const struct inode *inode) -{ - struct list_head *list = chunk_hash(inode); - struct audit_chunk *p; - - list_for_each_entry_rcu(p, list, hash) { - if (p->watch.inode == inode) { - atomic_long_inc(&p->refs); - return p; - } - } - return NULL; -} - -int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) -{ - int n; - for (n = 0; n < chunk->count; n++) - if (chunk->owners[n].owner == tree) - return 1; - return 0; -} - -/* tagging and untagging inodes with trees */ - -static struct audit_chunk *find_chunk(struct node *p) -{ - int index = p->index & ~(1U<<31); - p -= index; - return container_of(p, struct audit_chunk, owners[0]); -} - -static void untag_chunk(struct node *p) -{ - struct audit_chunk *chunk = find_chunk(p); - struct audit_chunk *new; - struct audit_tree *owner; - int size = chunk->count - 1; - int i, j; - - if (!pin_inotify_watch(&chunk->watch)) { - /* - * Filesystem is shutting down; all watches are getting - * evicted, just take it off the node list for this - * tree and let the eviction 
logics take care of the - * rest. - */ - owner = p->owner; - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - return; - } - - spin_unlock(&hash_lock); - - /* - * pin_inotify_watch() succeeded, so the watch won't go away - * from under us. - */ - mutex_lock(&chunk->watch.inode->inotify_mutex); - if (chunk->dead) { - mutex_unlock(&chunk->watch.inode->inotify_mutex); - goto out; - } - - owner = p->owner; - - if (!size) { - chunk->dead = 1; - spin_lock(&hash_lock); - list_del_init(&chunk->trees); - if (owner->root == chunk) - owner->root = NULL; - list_del_init(&p->list); - list_del_rcu(&chunk->hash); - spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); - goto out; - } - - new = alloc_chunk(size); - if (!new) - goto Fallback; - if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { - free_chunk(new); - goto Fallback; - } - - chunk->dead = 1; - spin_lock(&hash_lock); - list_replace_init(&chunk->trees, &new->trees); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - - for (i = j = 0; j <= size; i++, j++) { - struct audit_tree *s; - if (&chunk->owners[j] == p) { - list_del_init(&p->list); - i--; - continue; - } - s = chunk->owners[j].owner; - new->owners[i].owner = s; - new->owners[i].index = chunk->owners[j].index - j + i; - if (!s) /* result of earlier fallback */ - continue; - get_tree(s); - list_replace_init(&chunk->owners[j].list, &new->owners[i].list); - } - - list_replace_rcu(&chunk->hash, &new->hash); - list_for_each_entry(owner, &new->trees, same_root) - owner->root = new; - spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); - goto out; - -Fallback: - // do the best we can - spin_lock(&hash_lock); - if (owner->root 
== chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - spin_unlock(&hash_lock); - mutex_unlock(&chunk->watch.inode->inotify_mutex); -out: - unpin_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); -} - -static int create_chunk(struct inode *inode, struct audit_tree *tree) -{ - struct audit_chunk *chunk = alloc_chunk(1); - if (!chunk) - return -ENOMEM; - - if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { - free_chunk(chunk); - return -ENOSPC; - } - - mutex_lock(&inode->inotify_mutex); - spin_lock(&hash_lock); - if (tree->goner) { - spin_unlock(&hash_lock); - chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&chunk->watch); - return 0; - } - chunk->owners[0].index = (1U << 31); - chunk->owners[0].owner = tree; - get_tree(tree); - list_add(&chunk->owners[0].list, &tree->chunks); - if (!tree->root) { - tree->root = chunk; - list_add(&tree->same_root, &chunk->trees); - } - insert_hash(chunk); - spin_unlock(&hash_lock); - mutex_unlock(&inode->inotify_mutex); - return 0; -} - -/* the first tagged inode becomes root of tree */ -static int tag_chunk(struct inode *inode, struct audit_tree *tree) -{ - struct inotify_watch *watch; - struct audit_tree *owner; - struct audit_chunk *chunk, *old; - struct node *p; - int n; - - if (inotify_find_watch(rtree_ih, inode, &watch) < 0) - return create_chunk(inode, tree); - - old = container_of(watch, struct audit_chunk, watch); - - /* are we already there? 
*/ - spin_lock(&hash_lock); - for (n = 0; n < old->count; n++) { - if (old->owners[n].owner == tree) { - spin_unlock(&hash_lock); - put_inotify_watch(&old->watch); - return 0; - } - } - spin_unlock(&hash_lock); - - chunk = alloc_chunk(old->count + 1); - if (!chunk) { - put_inotify_watch(&old->watch); - return -ENOMEM; - } - - mutex_lock(&inode->inotify_mutex); - if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { - mutex_unlock(&inode->inotify_mutex); - free_chunk(chunk); - return -ENOSPC; - } - spin_lock(&hash_lock); - if (tree->goner) { - spin_unlock(&hash_lock); - chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&chunk->watch); - return 0; - } - list_replace_init(&old->trees, &chunk->trees); - for (n = 0, p = chunk->owners; n < old->count; n++, p++) { - struct audit_tree *s = old->owners[n].owner; - p->owner = s; - p->index = old->owners[n].index; - if (!s) /* result of fallback in untag */ - continue; - get_tree(s); - list_replace_init(&old->owners[n].list, &p->list); - } - p->index = (chunk->count - 1) | (1U<<31); - p->owner = tree; - get_tree(tree); - list_add(&p->list, &tree->chunks); - list_replace_rcu(&old->hash, &chunk->hash); - list_for_each_entry(owner, &chunk->trees, same_root) - owner->root = chunk; - old->dead = 1; - if (!tree->root) { - tree->root = chunk; - list_add(&tree->same_root, &chunk->trees); - } - spin_unlock(&hash_lock); - inotify_evict_watch(&old->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ - put_inotify_watch(&old->watch); /* and kill it */ - return 0; -} - -static void kill_rules(struct audit_tree *tree) -{ - struct audit_krule *rule, *next; - struct audit_entry *entry; - struct audit_buffer *ab; - - list_for_each_entry_safe(rule, next, &tree->rules, rlist) { - entry = container_of(rule, struct audit_entry, rule); - - list_del_init(&rule->rlist); - if (rule->tree) { - /* not a half-baked one */ - 
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule dir="); - audit_log_untrustedstring(ab, rule->tree->pathname); - if (rule->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, rule->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", rule->listnr); - audit_log_end(ab); - rule->tree = NULL; - list_del_rcu(&entry->list); - call_rcu(&entry->rcu, audit_free_rule_rcu); - } - } -} - -/* - * finish killing struct audit_tree - */ -static void prune_one(struct audit_tree *victim) -{ - spin_lock(&hash_lock); - while (!list_empty(&victim->chunks)) { - struct node *p; - - p = list_entry(victim->chunks.next, struct node, list); - - untag_chunk(p); - } - spin_unlock(&hash_lock); - put_tree(victim); -} - -/* trim the uncommitted chunks from tree */ - -static void trim_marked(struct audit_tree *tree) -{ - struct list_head *p, *q; - spin_lock(&hash_lock); - if (tree->goner) { - spin_unlock(&hash_lock); - return; - } - /* reorder */ - for (p = tree->chunks.next; p != &tree->chunks; p = q) { - struct node *node = list_entry(p, struct node, list); - q = p->next; - if (node->index & (1U<<31)) { - list_del_init(p); - list_add(p, &tree->chunks); - } - } - - while (!list_empty(&tree->chunks)) { - struct node *node; - - node = list_entry(tree->chunks.next, struct node, list); - - /* have we run out of marked? 
*/ - if (!(node->index & (1U<<31))) - break; - - untag_chunk(node); - } - if (!tree->root && !tree->goner) { - tree->goner = 1; - spin_unlock(&hash_lock); - mutex_lock(&audit_filter_mutex); - kill_rules(tree); - list_del_init(&tree->list); - mutex_unlock(&audit_filter_mutex); - prune_one(tree); - } else { - spin_unlock(&hash_lock); - } -} - -/* called with audit_filter_mutex */ -int audit_remove_tree_rule(struct audit_krule *rule) -{ - struct audit_tree *tree; - tree = rule->tree; - if (tree) { - spin_lock(&hash_lock); - list_del_init(&rule->rlist); - if (list_empty(&tree->rules) && !tree->goner) { - tree->root = NULL; - list_del_init(&tree->same_root); - tree->goner = 1; - list_move(&tree->list, &prune_list); - rule->tree = NULL; - spin_unlock(&hash_lock); - audit_schedule_prune(); - return 1; - } - rule->tree = NULL; - spin_unlock(&hash_lock); - return 1; - } - return 0; -} - -void audit_trim_trees(void) -{ - struct list_head cursor; - - mutex_lock(&audit_filter_mutex); - list_add(&cursor, &tree_list); - while (cursor.next != &tree_list) { - struct audit_tree *tree; - struct nameidata nd; - struct vfsmount *root_mnt; - struct node *node; - struct list_head list; - int err; - - tree = container_of(cursor.next, struct audit_tree, list); - get_tree(tree); - list_del(&cursor); - list_add(&cursor, &tree->list); - mutex_unlock(&audit_filter_mutex); - - err = path_lookup(tree->pathname, 0, &nd); - if (err) - goto skip_it; - - root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); - if (!root_mnt) - goto skip_it; - - list_add_tail(&list, &root_mnt->mnt_list); - spin_lock(&hash_lock); - list_for_each_entry(node, &tree->chunks, list) { - struct audit_chunk *chunk = find_chunk(node); - struct inode *inode = chunk->watch.inode; - struct vfsmount *mnt; - node->index |= 1U<<31; - list_for_each_entry(mnt, &list, mnt_list) { - if (mnt->mnt_root->d_inode == inode) { - node->index &= ~(1U<<31); - break; - } - } - } - spin_unlock(&hash_lock); - 
trim_marked(tree); - put_tree(tree); - list_del_init(&list); - drop_collected_mounts(root_mnt); -skip_it: - mutex_lock(&audit_filter_mutex); - } - list_del(&cursor); - mutex_unlock(&audit_filter_mutex); -} - -static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct nameidata *nd) -{ - if (mnt != nd->path.mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) - return 0; - if (mnt->mnt_parent == nd->path.mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - return is_subdir(dentry, nd->path.dentry); -} - -int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) -{ - - if (pathname[0] != '/' || - rule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || - rule->inode_f || rule->watch || rule->tree) - return -EINVAL; - rule->tree = alloc_tree(pathname); - if (!rule->tree) - return -ENOMEM; - return 0; -} - -void audit_put_tree(struct audit_tree *tree) -{ - put_tree(tree); -} - -/* called with audit_filter_mutex */ -int audit_add_tree_rule(struct audit_krule *rule) -{ - struct audit_tree *seed = rule->tree, *tree; - struct nameidata nd; - struct vfsmount *mnt, *p; - struct list_head list; - int err; - - list_for_each_entry(tree, &tree_list, list) { - if (!strcmp(seed->pathname, tree->pathname)) { - put_tree(seed); - rule->tree = tree; - list_add(&rule->rlist, &tree->rules); - return 0; - } - } - tree = seed; - list_add(&tree->list, &tree_list); - list_add(&rule->rlist, &tree->rules); - /* do not set rule->tree yet */ - mutex_unlock(&audit_filter_mutex); - - err = path_lookup(tree->pathname, 0, &nd); - if (err) - goto Err; - mnt = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); - if (!mnt) { - err = -ENOMEM; - goto Err; - } - list_add_tail(&list, &mnt->mnt_list); - - get_tree(tree); - list_for_each_entry(p, &list, mnt_list) { - err = tag_chunk(p->mnt_root->d_inode, tree); - if (err) - break; - } - - list_del(&list); - drop_collected_mounts(mnt); - - if (!err) { - struct node *node; - 
spin_lock(&hash_lock); - list_for_each_entry(node, &tree->chunks, list) - node->index &= ~(1U<<31); - spin_unlock(&hash_lock); - } else { - trim_marked(tree); - goto Err; - } - - mutex_lock(&audit_filter_mutex); - if (list_empty(&rule->rlist)) { - put_tree(tree); - return -ENOENT; - } - rule->tree = tree; - put_tree(tree); - - return 0; -Err: - mutex_lock(&audit_filter_mutex); - list_del_init(&tree->list); - list_del_init(&tree->rules); - put_tree(tree); - return err; -} - -int audit_tag_tree(char *old, char *new) -{ - struct list_head cursor, barrier; - int failed = 0; - struct nameidata nd; - struct vfsmount *tagged; - struct list_head list; - struct vfsmount *mnt; - struct dentry *dentry; - int err; - - err = path_lookup(new, 0, &nd); - if (err) - return err; - tagged = collect_mounts(nd.path.mnt, nd.path.dentry); - path_put(&nd.path); - if (!tagged) - return -ENOMEM; - - err = path_lookup(old, 0, &nd); - if (err) { - drop_collected_mounts(tagged); - return err; - } - mnt = mntget(nd.path.mnt); - dentry = dget(nd.path.dentry); - path_put(&nd.path); - - if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) - follow_up(&mnt, &dentry); - - list_add_tail(&list, &tagged->mnt_list); - - mutex_lock(&audit_filter_mutex); - list_add(&barrier, &tree_list); - list_add(&cursor, &barrier); - - while (cursor.next != &tree_list) { - struct audit_tree *tree; - struct vfsmount *p; - - tree = container_of(cursor.next, struct audit_tree, list); - get_tree(tree); - list_del(&cursor); - list_add(&cursor, &tree->list); - mutex_unlock(&audit_filter_mutex); - - err = path_lookup(tree->pathname, 0, &nd); - if (err) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - continue; - } - - spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &nd)) { - spin_unlock(&vfsmount_lock); - path_put(&nd.path); - put_tree(tree); - mutex_lock(&audit_filter_mutex); - continue; - } - spin_unlock(&vfsmount_lock); - path_put(&nd.path); - - list_for_each_entry(p, &list, mnt_list) { - failed = 
tag_chunk(p->mnt_root->d_inode, tree); - if (failed) - break; - } - - if (failed) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - break; - } - - mutex_lock(&audit_filter_mutex); - spin_lock(&hash_lock); - if (!tree->goner) { - list_del(&tree->list); - list_add(&tree->list, &tree_list); - } - spin_unlock(&hash_lock); - put_tree(tree); - } - - while (barrier.prev != &tree_list) { - struct audit_tree *tree; - - tree = container_of(barrier.prev, struct audit_tree, list); - get_tree(tree); - list_del(&tree->list); - list_add(&tree->list, &barrier); - mutex_unlock(&audit_filter_mutex); - - if (!failed) { - struct node *node; - spin_lock(&hash_lock); - list_for_each_entry(node, &tree->chunks, list) - node->index &= ~(1U<<31); - spin_unlock(&hash_lock); - } else { - trim_marked(tree); - } - - put_tree(tree); - mutex_lock(&audit_filter_mutex); - } - list_del(&barrier); - list_del(&cursor); - list_del(&list); - mutex_unlock(&audit_filter_mutex); - dput(dentry); - mntput(mnt); - drop_collected_mounts(tagged); - return failed; -} - -/* - * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread, with audit_cmd_mutex held. 
- */ -void audit_prune_trees(void) -{ - mutex_lock(&audit_filter_mutex); - - while (!list_empty(&prune_list)) { - struct audit_tree *victim; - - victim = list_entry(prune_list.next, struct audit_tree, list); - list_del_init(&victim->list); - - mutex_unlock(&audit_filter_mutex); - - prune_one(victim); - - mutex_lock(&audit_filter_mutex); - } - - mutex_unlock(&audit_filter_mutex); -} - -/* - * Here comes the stuff asynchronous to auditctl operations - */ - -/* inode->inotify_mutex is locked */ -static void evict_chunk(struct audit_chunk *chunk) -{ - struct audit_tree *owner; - int n; - - if (chunk->dead) - return; - - chunk->dead = 1; - mutex_lock(&audit_filter_mutex); - spin_lock(&hash_lock); - while (!list_empty(&chunk->trees)) { - owner = list_entry(chunk->trees.next, - struct audit_tree, same_root); - owner->goner = 1; - owner->root = NULL; - list_del_init(&owner->same_root); - spin_unlock(&hash_lock); - kill_rules(owner); - list_move(&owner->list, &prune_list); - audit_schedule_prune(); - spin_lock(&hash_lock); - } - list_del_rcu(&chunk->hash); - for (n = 0; n < chunk->count; n++) - list_del_init(&chunk->owners[n].list); - spin_unlock(&hash_lock); - mutex_unlock(&audit_filter_mutex); -} - -static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) -{ - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - - if (mask & IN_IGNORED) { - evict_chunk(chunk); - put_inotify_watch(watch); - } -} - -static void destroy_watch(struct inotify_watch *watch) -{ - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - call_rcu(&chunk->head, __put_chunk); -} - -static const struct inotify_operations rtree_inotify_ops = { - .handle_event = handle_event, - .destroy_watch = destroy_watch, -}; - -static int __init audit_tree_init(void) -{ - int i; - - rtree_ih = inotify_init(&rtree_inotify_ops); - if (IS_ERR(rtree_ih)) - audit_panic("cannot initialize inotify handle for 
rectree watches"); - - for (i = 0; i < HASH_SIZE; i++) - INIT_LIST_HEAD(&chunk_hash_heads[i]); - - return 0; -} -__initcall(audit_tree_init); -/* auditfilter.c -- filtering of audit events - * - * Copyright 2003-2004 Red Hat, Inc. - * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "audit.h" - -/* - * Locking model: - * - * audit_filter_mutex: - * Synchronizes writes and blocking reads of audit's filterlist - * data. Rcu is used to traverse the filterlist and access - * contents of structs audit_entry, audit_watch and opaque - * LSM rules during filtering. If modified, these structures - * must be copied and replace their counterparts in the filterlist. - * An audit_parent struct is not accessed during filtering, so may - * be written directly provided audit_filter_mutex is held. - */ - -/* - * Reference counting: - * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED - * event. Each audit_watch holds a reference to its associated parent. - * - * audit_watch: if added to lists, lifetime is from audit_init_watch() to - * audit_remove_watch(). 
Additionally, an audit_watch may exist - * temporarily to assist in searching existing filter data. Each - * audit_krule holds a reference to its associated watch. - */ - -struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ -}; - -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 - -/* Audit filter lists, defined in */ -struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { - LIST_HEAD_INIT(audit_filter_list[0]), - LIST_HEAD_INIT(audit_filter_list[1]), - LIST_HEAD_INIT(audit_filter_list[2]), - LIST_HEAD_INIT(audit_filter_list[3]), - LIST_HEAD_INIT(audit_filter_list[4]), - LIST_HEAD_INIT(audit_filter_list[5]), -#if AUDIT_NR_FILTERS != 6 -#error Fix audit_filter_list initialiser -#endif -}; - -DEFINE_MUTEX(audit_filter_mutex); - -/* Inotify events we care about. 
*/ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF - -void audit_free_parent(struct inotify_watch *i_watch) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); -} - -static inline void audit_get_watch(struct audit_watch *watch) -{ - atomic_inc(&watch->count); -} - -static void audit_put_watch(struct audit_watch *watch) -{ - if (atomic_dec_and_test(&watch->count)) { - WARN_ON(watch->parent); - WARN_ON(!list_empty(&watch->rules)); - kfree(watch->path); - kfree(watch); - } -} - -static void audit_remove_watch(struct audit_watch *watch) -{ - list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); - watch->parent = NULL; - audit_put_watch(watch); /* match initial get */ -} - -static inline void audit_free_rule(struct audit_entry *e) -{ - int i; - - /* some rules don't have associated watches */ - if (e->rule.watch) - audit_put_watch(e->rule.watch); - if (e->rule.fields) - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - kfree(f->lsm_str); - security_audit_rule_free(f->lsm_rule); - } - kfree(e->rule.fields); - kfree(e->rule.filterkey); - kfree(e); -} - -void audit_free_rule_rcu(struct rcu_head *head) -{ - struct audit_entry *e = container_of(head, struct audit_entry, rcu); - audit_free_rule(e); -} - -/* Initialize a parent watch entry. 
*/ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) -{ - struct audit_parent *parent; - s32 wd; - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (unlikely(!parent)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); - } - - return parent; -} - -/* Initialize a watch entry. */ -static struct audit_watch *audit_init_watch(char *path) -{ - struct audit_watch *watch; - - watch = kzalloc(sizeof(*watch), GFP_KERNEL); - if (unlikely(!watch)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); - watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; - - return watch; -} - -/* Initialize an audit filterlist entry. */ -static inline struct audit_entry *audit_init_entry(u32 field_count) -{ - struct audit_entry *entry; - struct audit_field *fields; - - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (unlikely(!entry)) - return NULL; - - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); - if (unlikely(!fields)) { - kfree(entry); - return NULL; - } - entry->rule.fields = fields; - - return entry; -} - -/* Unpack a filter field's string representation from user-space - * buffer. */ -char *audit_unpack_string(void **bufp, size_t *remain, size_t len) -{ - char *str; - - if (!*bufp || (len == 0) || (len > *remain)) - return ERR_PTR(-EINVAL); - - /* Of the currently implemented string fields, PATH_MAX - * defines the longest valid length. 
- */ - if (len > PATH_MAX) - return ERR_PTR(-ENAMETOOLONG); - - str = kmalloc(len + 1, GFP_KERNEL); - if (unlikely(!str)) - return ERR_PTR(-ENOMEM); - - memcpy(str, *bufp, len); - str[len] = 0; - *bufp += len; - *remain -= len; - - return str; -} - -/* Translate an inode field to kernel respresentation. */ -static inline int audit_to_inode(struct audit_krule *krule, - struct audit_field *f) -{ - if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree) - return -EINVAL; - - krule->inode_f = f; - return 0; -} - -/* Translate a watch string to kernel respresentation. */ -static int audit_to_watch(struct audit_krule *krule, char *path, int len, - u32 op) -{ - struct audit_watch *watch; - - if (!audit_ih) - return -EOPNOTSUPP; - - if (path[0] != '/' || path[len-1] == '/' || - krule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || - krule->inode_f || krule->watch || krule->tree) - return -EINVAL; - - watch = audit_init_watch(path); - if (IS_ERR(watch)) - return PTR_ERR(watch); - - audit_get_watch(watch); - krule->watch = watch; - - return 0; -} - -static __u32 *classes[AUDIT_SYSCALL_CLASSES]; - -int __init audit_register_class(int class, unsigned *list) -{ - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); - if (!p) - return -ENOMEM; - while (*list != ~0U) { - unsigned n = *list++; - if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { - kfree(p); - return -EINVAL; - } - p[AUDIT_WORD(n)] |= AUDIT_BIT(n); - } - if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { - kfree(p); - return -EINVAL; - } - classes[class] = p; - return 0; -} - -int audit_match_class(int class, unsigned syscall) -{ - if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) - return 0; - if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) - return 0; - return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); -} - -#ifdef CONFIG_AUDITSYSCALL -static inline int audit_match_class_bits(int class, u32 *mask) -{ - int i; - 
- if (classes[class]) { - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - if (mask[i] & classes[class][i]) - return 0; - } - return 1; -} - -static int audit_match_signal(struct audit_entry *entry) -{ - struct audit_field *arch = entry->rule.arch_f; - - if (!arch) { - /* When arch is unspecified, we must check both masks on biarch - * as syscall number alone is ambiguous. */ - return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, - entry->rule.mask) && - audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, - entry->rule.mask)); - } - - switch(audit_classify_arch(arch->val)) { - case 0: /* native */ - return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, - entry->rule.mask)); - case 1: /* 32bit on biarch */ - return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, - entry->rule.mask)); - default: - return 1; - } -} -#endif - -/* Common user-space to kernel rule translation. */ -static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) -{ - unsigned listnr; - struct audit_entry *entry; - int i, err; - - err = -EINVAL; - listnr = rule->flags & ~AUDIT_FILTER_PREPEND; - switch(listnr) { - default: - goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: -#ifdef CONFIG_AUDITSYSCALL - case AUDIT_FILTER_ENTRY: - case AUDIT_FILTER_EXIT: - case AUDIT_FILTER_TASK: -#endif - ; - } - if (unlikely(rule->action == AUDIT_POSSIBLE)) { - printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); - goto exit_err; - } - if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) - goto exit_err; - if (rule->field_count > AUDIT_MAX_FIELDS) - goto exit_err; - - err = -ENOMEM; - entry = audit_init_entry(rule->field_count); - if (!entry) - goto exit_err; - - entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; - entry->rule.listnr = listnr; - entry->rule.action = rule->action; - entry->rule.field_count = rule->field_count; - - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - entry->rule.mask[i] = rule->mask[i]; - - for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { - int bit = 
AUDIT_BITMASK_SIZE * 32 - i - 1; - __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; - __u32 *class; - - if (!(*p & AUDIT_BIT(bit))) - continue; - *p &= ~AUDIT_BIT(bit); - class = classes[i]; - if (class) { - int j; - for (j = 0; j < AUDIT_BITMASK_SIZE; j++) - entry->rule.mask[j] |= class[j]; - } - } - - return entry; - -exit_err: - return ERR_PTR(err); -} - -/* Translate struct audit_rule to kernel's rule respresentation. - * Exists for backward compatibility with userspace. */ -static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) -{ - struct audit_entry *entry; - struct audit_field *ino_f; - int err = 0; - int i; - - entry = audit_to_entry_common(rule); - if (IS_ERR(entry)) - goto exit_nofree; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); - f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); - f->val = rule->values[i]; - - err = -EINVAL; - switch(f->type) { - default: - goto exit_free; - case AUDIT_PID: - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - case AUDIT_LOGINUID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - /* bit ops are only useful on syscall args */ - if (f->op == AUDIT_BIT_MASK || - f->op == AUDIT_BIT_TEST) { - err = -EINVAL; - goto exit_free; - } - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; - /* arch is only allowed to be = or != */ - case AUDIT_ARCH: - if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) - && (f->op != AUDIT_NEGATE) && (f->op)) { - err = -EINVAL; - goto exit_free; - } - entry->rule.arch_f = f; - break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) - goto 
exit_free; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - } - - entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (f->op & AUDIT_NEGATE) - f->op = AUDIT_NOT_EQUAL; - else if (!f->op) - f->op = AUDIT_EQUAL; - else if (f->op == AUDIT_OPERATORS) { - err = -EINVAL; - goto exit_free; - } - } - - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } - -exit_nofree: - return entry; - -exit_free: - audit_free_rule(entry); - return ERR_PTR(err); -} - -/* Translate struct audit_rule_data to kernel's rule respresentation. */ -static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, - size_t datasz) -{ - int err = 0; - struct audit_entry *entry; - struct audit_field *ino_f; - void *bufp; - size_t remain = datasz - sizeof(struct audit_rule_data); - int i; - char *str; - - entry = audit_to_entry_common((struct audit_rule *)data); - if (IS_ERR(entry)) - goto exit_nofree; - - bufp = data->buf; - entry->rule.vers_ops = 2; - for (i = 0; i < data->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - - err = -EINVAL; - if (!(data->fieldflags[i] & AUDIT_OPERATORS) || - data->fieldflags[i] & ~AUDIT_OPERATORS) - goto exit_free; - - f->op = data->fieldflags[i] & AUDIT_OPERATORS; - f->type = data->fields[i]; - f->val = data->values[i]; - f->lsm_str = NULL; - f->lsm_rule = NULL; - switch(f->type) { - case AUDIT_PID: - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - case AUDIT_LOGINUID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case 
AUDIT_SUCCESS: - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; - case AUDIT_ARCH: - entry->rule.arch_f = f; - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - - err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->lsm_rule); - /* Keep currently invalid fields around in case they - * become valid after a policy reload. */ - if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM " - "\'%s\' is invalid\n", str); - err = 0; - } - if (err) { - kfree(str); - goto exit_free; - } else - f->lsm_str = str; - break; - case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); - if (err) { - kfree(str); - goto exit_free; - } - break; - case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - - err = audit_make_tree(&entry->rule, str, f->op); - kfree(str); - if (err) - goto exit_free; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - case AUDIT_FILTERKEY: - err = -EINVAL; - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) - goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - entry->rule.filterkey = str; - break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) - goto exit_free; - break; - default: - goto exit_free; - } - } - - ino_f = entry->rule.inode_f; - 
if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } - -exit_nofree: - return entry; - -exit_free: - audit_free_rule(entry); - return ERR_PTR(err); -} - -/* Pack a filter field's string representation into data block. */ -static inline size_t audit_pack_string(void **bufp, const char *str) -{ - size_t len = strlen(str); - - memcpy(*bufp, str, len); - *bufp += len; - - return len; -} - -/* Translate kernel rule respresentation to struct audit_rule. - * Exists for backward compatibility with userspace. */ -static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) -{ - struct audit_rule *rule; - int i; - - rule = kzalloc(sizeof(*rule), GFP_KERNEL); - if (unlikely(!rule)) - return NULL; - - rule->flags = krule->flags | krule->listnr; - rule->action = krule->action; - rule->field_count = krule->field_count; - for (i = 0; i < rule->field_count; i++) { - rule->values[i] = krule->fields[i].val; - rule->fields[i] = krule->fields[i].type; - - if (krule->vers_ops == 1) { - if (krule->fields[i].op & AUDIT_NOT_EQUAL) - rule->fields[i] |= AUDIT_NEGATE; - } else { - rule->fields[i] |= krule->fields[i].op; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; - - return rule; -} - -/* Translate kernel rule respresentation to struct audit_rule_data. 
*/ -static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) -{ - struct audit_rule_data *data; - void *bufp; - int i; - - data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); - if (unlikely(!data)) - return NULL; - memset(data, 0, sizeof(*data)); - - data->flags = krule->flags | krule->listnr; - data->action = krule->action; - data->field_count = krule->field_count; - bufp = data->buf; - for (i = 0; i < data->field_count; i++) { - struct audit_field *f = &krule->fields[i]; - - data->fields[i] = f->type; - data->fieldflags[i] = f->op; - switch(f->type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - data->buflen += data->values[i] = - audit_pack_string(&bufp, f->lsm_str); - break; - case AUDIT_WATCH: - data->buflen += data->values[i] = - audit_pack_string(&bufp, krule->watch->path); - break; - case AUDIT_DIR: - data->buflen += data->values[i] = - audit_pack_string(&bufp, - audit_tree_path(krule->tree)); - break; - case AUDIT_FILTERKEY: - data->buflen += data->values[i] = - audit_pack_string(&bufp, krule->filterkey); - break; - default: - data->values[i] = f->val; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; - - return data; -} - -/* Compare two rules in kernel format. Considered success if rules - * don't match. 
*/ -static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) -{ - int i; - - if (a->flags != b->flags || - a->listnr != b->listnr || - a->action != b->action || - a->field_count != b->field_count) - return 1; - - for (i = 0; i < a->field_count; i++) { - if (a->fields[i].type != b->fields[i].type || - a->fields[i].op != b->fields[i].op) - return 1; - - switch(a->fields[i].type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) - return 1; - break; - case AUDIT_WATCH: - if (strcmp(a->watch->path, b->watch->path)) - return 1; - break; - case AUDIT_DIR: - if (strcmp(audit_tree_path(a->tree), - audit_tree_path(b->tree))) - return 1; - break; - case AUDIT_FILTERKEY: - /* both filterkeys exist based on above type compare */ - if (strcmp(a->filterkey, b->filterkey)) - return 1; - break; - default: - if (a->fields[i].val != b->fields[i].val) - return 1; - } - } - - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - if (a->mask[i] != b->mask[i]) - return 1; - - return 0; -} - -/* Duplicate the given audit watch. The new watch's rules list is initialized - * to an empty list and wlist is undefined. */ -static struct audit_watch *audit_dupe_watch(struct audit_watch *old) -{ - char *path; - struct audit_watch *new; - - path = kstrdup(old->path, GFP_KERNEL); - if (unlikely(!path)) - return ERR_PTR(-ENOMEM); - - new = audit_init_watch(path); - if (IS_ERR(new)) { - kfree(path); - goto out; - } - - new->dev = old->dev; - new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); - new->parent = old->parent; - -out: - return new; -} - -/* Duplicate LSM field information. The lsm_rule is opaque, so must be - * re-initialized. 
*/ -static inline int audit_dupe_lsm_field(struct audit_field *df, - struct audit_field *sf) -{ - int ret = 0; - char *lsm_str; - - /* our own copy of lsm_str */ - lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); - if (unlikely(!lsm_str)) - return -ENOMEM; - df->lsm_str = lsm_str; - - /* our own (refreshed) copy of lsm_rule */ - ret = security_audit_rule_init(df->type, df->op, df->lsm_str, - (void **)&df->lsm_rule); - /* Keep currently invalid fields around in case they - * become valid after a policy reload. */ - if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->lsm_str); - ret = 0; - } - - return ret; -} - -/* Duplicate an audit rule. This will be a deep copy with the exception - * of the watch - that pointer is carried over. The LSM specific fields - * will be updated in the copy. The point is to be able to replace the old - * rule with the new rule in the filterlist, then free the old rule. - * The rlist element is undefined; list manipulations are handled apart from - * the initial copy. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) -{ - u32 fcount = old->field_count; - struct audit_entry *entry; - struct audit_krule *new; - char *fk; - int i, err = 0; - - entry = audit_init_entry(fcount); - if (unlikely(!entry)) - return ERR_PTR(-ENOMEM); - - new = &entry->rule; - new->vers_ops = old->vers_ops; - new->flags = old->flags; - new->listnr = old->listnr; - new->action = old->action; - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - new->mask[i] = old->mask[i]; - new->buflen = old->buflen; - new->inode_f = old->inode_f; - new->watch = NULL; - new->field_count = old->field_count; - /* - * note that we are OK with not refcounting here; audit_match_tree() - * never dereferences tree and we can't get false positives there - * since we'd have to have rule gone from the list *and* removed - * before the chunks found by lookup had been allocated, i.e. 
before - * the beginning of list scan. - */ - new->tree = old->tree; - memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); - - /* deep copy this information, updating the lsm_rule fields, because - * the originals will all be freed when the old rule is freed. */ - for (i = 0; i < fcount; i++) { - switch (new->fields[i].type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - err = audit_dupe_lsm_field(&new->fields[i], - &old->fields[i]); - break; - case AUDIT_FILTERKEY: - fk = kstrdup(old->filterkey, GFP_KERNEL); - if (unlikely(!fk)) - err = -ENOMEM; - else - new->filterkey = fk; - } - if (err) { - audit_free_rule(entry); - return ERR_PTR(err); - } - } - - if (watch) { - audit_get_watch(watch); - new->watch = watch; - } - - return entry; -} - -/* Update inode info in audit rules based on filesystem event. */ -static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, - unsigned long ino, unsigned invalidating) -{ - struct audit_watch *owatch, *nwatch, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *oentry, *nentry; - - mutex_lock(&audit_filter_mutex); - list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) - continue; - - /* If the update involves invalidating rules, do the inode-based - * filtering now, so we don't omit records. 
*/ - if (invalidating && current->audit_context && - audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) - audit_set_auditable(current->audit_context); - - nwatch = audit_dupe_watch(owatch); - if (IS_ERR(nwatch)) { - mutex_unlock(&audit_filter_mutex); - audit_panic("error updating watch, skipping"); - return; - } - nwatch->dev = dev; - nwatch->ino = ino; - - list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { - - oentry = container_of(r, struct audit_entry, rule); - list_del(&oentry->rule.rlist); - list_del_rcu(&oentry->list); - - nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) - audit_panic("error updating watch, removing"); - else { - int h = audit_hash_ino((u32)ino); - list_add(&nentry->rule.rlist, &nwatch->rules); - list_add_rcu(&nentry->list, &audit_inode_hash[h]); - } - - call_rcu(&oentry->rcu, audit_free_rule_rcu); - } - - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, - " op=updated rules specifying path="); - audit_log_untrustedstring(ab, owatch->path); - audit_log_format(ab, " with dev=%u ino=%lu\n", - dev, ino); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } - audit_remove_watch(owatch); - goto add_watch_to_parent; /* event applies to a single watch */ - } - mutex_unlock(&audit_filter_mutex); - return; - -add_watch_to_parent: - list_add(&nwatch->wlist, &parent->watches); - mutex_unlock(&audit_filter_mutex); - return; -} - -/* Remove all watches & rules associated with a parent that is going away. 
*/ -static void audit_remove_parent_watches(struct audit_parent *parent) -{ - struct audit_watch *w, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *e; - - mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; - list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { - list_for_each_entry_safe(r, nextr, &w->rules, rlist) { - e = container_of(r, struct audit_entry, rule); - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, - AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_format(ab, " op=remove rule path="); - audit_log_untrustedstring(ab, w->path); - if (r->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, - r->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d res=1", - r->listnr); - audit_log_end(ab); - } - list_del(&r->rlist); - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - } - audit_remove_watch(w); - } - mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -static void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } -} - -/* Find an existing audit rule. - * Caller must hold audit_filter_mutex to prevent stale rule data. 
*/ -static struct audit_entry *audit_find_rule(struct audit_entry *entry, - struct list_head *list) -{ - struct audit_entry *e, *found = NULL; - int h; - - if (entry->rule.watch) { - /* we don't know the inode number, so must walk entire hash */ - for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { - list = &audit_inode_hash[h]; - list_for_each_entry(e, list, list) - if (!audit_compare_rule(&entry->rule, &e->rule)) { - found = e; - goto out; - } - } - goto out; - } - - list_for_each_entry(e, list, list) - if (!audit_compare_rule(&entry->rule, &e->rule)) { - found = e; - goto out; - } - -out: - return found; -} - -/* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, - struct nameidata **ndw) -{ - struct nameidata *ndparent, *ndwatch; - int err; - - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; - - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; - } - - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; - } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; - } - - *ndp = ndparent; - *ndw = ndwatch; - - return 0; -} - -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - -/* Associate the given rule with an existing parent inotify_watch. - * Caller must hold audit_filter_mutex. 
*/ -static void audit_add_to_parent(struct audit_krule *krule, - struct audit_parent *parent) -{ - struct audit_watch *w, *watch = krule->watch; - int watch_found = 0; - - list_for_each_entry(w, &parent->watches, wlist) { - if (strcmp(watch->path, w->path)) - continue; - - watch_found = 1; - - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); - audit_put_watch(watch); - - audit_get_watch(w); - krule->watch = watch = w; - break; - } - - if (!watch_found) { - get_inotify_watch(&parent->wdata); - watch->parent = parent; - - list_add(&watch->wlist, &parent->watches); - } - list_add(&krule->rlist, &watch->rules); -} - -/* Find a matching watch entry, or add this one. - * Caller must hold audit_filter_mutex. */ -static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, - struct nameidata *ndw) -{ - struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; - struct audit_parent *parent; - int ret = 0; - - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } - - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. 
- */ - mutex_unlock(&audit_filter_mutex); - - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { - parent = audit_init_parent(ndp); - if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - return PTR_ERR(parent); - } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); - - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); - - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); - return ret; -} - -/* Add rule to given filterlist if not a duplicate. */ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) -{ - struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; - struct audit_watch *watch = entry->rule.watch; - struct audit_tree *tree = entry->rule.tree; - struct nameidata *ndp = NULL, *ndw = NULL; - int h, err; -#ifdef CONFIG_AUDITSYSCALL - int dont_count = 0; - - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) - dont_count = 1; -#endif - - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); - mutex_unlock(&audit_filter_mutex); - if (e) { - err = -EEXIST; - /* normally audit_add_tree_rule() will free it on failure */ - if (tree) - audit_put_tree(tree); - goto error; - } - - /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - if (watch) { - err = audit_get_nd(watch->path, &ndp, &ndw); - if (err) - goto error; - } - - mutex_lock(&audit_filter_mutex); - if (watch) { - /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule, ndp, ndw); - if (err) { - mutex_unlock(&audit_filter_mutex); - goto error; - } - h = audit_hash_ino((u32)watch->ino); - list = &audit_inode_hash[h]; - } - if (tree) { - err = audit_add_tree_rule(&entry->rule); - if (err) { - mutex_unlock(&audit_filter_mutex); - goto error; - } - } - - if (entry->rule.flags & AUDIT_FILTER_PREPEND) { - list_add_rcu(&entry->list, list); - entry->rule.flags &= ~AUDIT_FILTER_PREPEND; - } else { - list_add_tail_rcu(&entry->list, list); - } -#ifdef CONFIG_AUDITSYSCALL - if (!dont_count) - audit_n_rules++; - - if (!audit_match_signal(entry)) - audit_signals++; -#endif - mutex_unlock(&audit_filter_mutex); - - audit_put_nd(ndp, ndw); /* NULL args OK */ - return 0; - -error: - audit_put_nd(ndp, ndw); /* NULL args OK */ - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ - return err; -} - -/* Remove an existing rule from filterlist. 
*/ -static inline int audit_del_rule(struct audit_entry *entry, - struct list_head *list) -{ - struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; - struct audit_watch *watch, *tmp_watch = entry->rule.watch; - struct audit_tree *tree = entry->rule.tree; - LIST_HEAD(inotify_list); - int h, ret = 0; -#ifdef CONFIG_AUDITSYSCALL - int dont_count = 0; - - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) - dont_count = 1; -#endif - - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); - if (!e) { - mutex_unlock(&audit_filter_mutex); - ret = -ENOENT; - goto out; - } - - watch = e->rule.watch; - if (watch) { - struct audit_parent *parent = watch->parent; - - list_del(&e->rule.rlist); - - if (list_empty(&watch->rules)) { - audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. - */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, &inotify_list); - } - } - } - - if (e->rule.tree) - audit_remove_tree_rule(&e->rule); - - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - -#ifdef CONFIG_AUDITSYSCALL - if (!dont_count) - audit_n_rules--; - - if (!audit_match_signal(entry)) - audit_signals--; -#endif - mutex_unlock(&audit_filter_mutex); - - if (!list_empty(&inotify_list)) - audit_inotify_unregister(&inotify_list); - -out: - if (tmp_watch) - audit_put_watch(tmp_watch); /* match initial get */ - if (tree) - audit_put_tree(tree); /* that's the temporary one */ - - return ret; -} - -/* List rules using struct audit_rule. 
Exists for backward - * compatibility with userspace. */ -static void audit_list(int pid, int seq, struct sk_buff_head *q) -{ - struct sk_buff *skb; - struct audit_entry *entry; - int i; - - /* This is a blocking read, so use audit_filter_mutex instead of rcu - * iterator to sync with list writers. */ - for (i=0; irule); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(entry, &audit_inode_hash[i], list) { - struct audit_rule *rule; - - rule = audit_krule_to_rule(&entry->rule); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - if (skb) - skb_queue_tail(q, skb); -} - -/* List rules using struct audit_rule_data. */ -static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) -{ - struct sk_buff *skb; - struct audit_entry *e; - int i; - - /* This is a blocking read, so use audit_filter_mutex instead of rcu - * iterator to sync with list writers. 
*/ - for (i=0; irule); - if (unlikely(!data)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(e, &audit_inode_hash[i], list) { - struct audit_rule_data *data; - - data = audit_krule_to_data(&e->rule); - if (unlikely(!data)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); - if (skb) - skb_queue_tail(q, skb); -} - -/* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, - char *action, struct audit_krule *rule, - int res) -{ - struct audit_buffer *ab; - - if (!audit_enabled) - return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - if (!ab) - return; - audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " op=%s rule key=", action); - if (rule->filterkey) - audit_log_untrustedstring(ab, rule->filterkey); - else - audit_log_format(ab, "(null)"); - audit_log_format(ab, " list=%d res=%d", rule->listnr, res); - audit_log_end(ab); -} - -/** - * audit_receive_filter - apply all rules to the specified message type - * @type: audit message type - * @pid: target pid for netlink audit messages - * @uid: target uid for netlink audit messages - * @seq: netlink audit message sequence (serial) number - * @data: payload data - * @datasz: size of payload data - * @loginuid: loginuid of sender - * @sessionid: sessionid for netlink audit message - * @sid: SE Linux Security ID of sender - */ 
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) -{ - struct task_struct *tsk; - struct audit_netlink_list *dest; - int err = 0; - struct audit_entry *entry; - - switch (type) { - case AUDIT_LIST: - case AUDIT_LIST_RULES: - /* We can't just spew out the rules here because we might fill - * the available socket buffer space and deadlock waiting for - * auditctl to read from it... which isn't ever going to - * happen if we're actually running in the context of auditctl - * trying to _send_ the stuff */ - - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); - if (!dest) - return -ENOMEM; - dest->pid = pid; - skb_queue_head_init(&dest->q); - - mutex_lock(&audit_filter_mutex); - if (type == AUDIT_LIST) - audit_list(pid, seq, &dest->q); - else - audit_list_rules(pid, seq, &dest->q); - mutex_unlock(&audit_filter_mutex); - - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); - if (IS_ERR(tsk)) { - skb_queue_purge(&dest->q); - kfree(dest); - err = PTR_ERR(tsk); - } - break; - case AUDIT_ADD: - case AUDIT_ADD_RULE: - if (type == AUDIT_ADD) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - err = audit_add_rule(entry, - &audit_filter_list[entry->rule.listnr]); - audit_log_rule_change(loginuid, sessionid, sid, "add", - &entry->rule, !err); - - if (err) - audit_free_rule(entry); - break; - case AUDIT_DEL: - case AUDIT_DEL_RULE: - if (type == AUDIT_DEL) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - err = audit_del_rule(entry, - &audit_filter_list[entry->rule.listnr]); - audit_log_rule_change(loginuid, sessionid, sid, "remove", - &entry->rule, !err); - - audit_free_rule(entry); - break; - default: - return -EINVAL; - } - - return err; -} - -int audit_comparator(const u32 left, const u32 op, const 
u32 right) -{ - switch (op) { - case AUDIT_EQUAL: - return (left == right); - case AUDIT_NOT_EQUAL: - return (left != right); - case AUDIT_LESS_THAN: - return (left < right); - case AUDIT_LESS_THAN_OR_EQUAL: - return (left <= right); - case AUDIT_GREATER_THAN: - return (left > right); - case AUDIT_GREATER_THAN_OR_EQUAL: - return (left >= right); - case AUDIT_BIT_MASK: - return (left & right); - case AUDIT_BIT_TEST: - return ((left & right) == right); - } - BUG(); - return 0; -} - -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen) -{ - int dlen, plen; - const char *p; - - if (!dname || !path) - return 1; - - dlen = strlen(dname); - plen = strlen(path); - if (plen < dlen) - return 1; - - /* disregard trailing slashes */ - p = path + plen - 1; - while ((*p == '/') && (p > path)) - p--; - - /* find last path component */ - p = p - dlen + 1; - if (p < path) - return 1; - else if (p > path) { - if (*--p != '/') - return 1; - else - p++; - } - - /* return length of path's directory component */ - if (dirlen) - *dirlen = p - path; - return strncmp(p, dname, dlen); -} - -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_krule *rule, - enum audit_state *state) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - int result = 0; - - switch (f->type) { - case AUDIT_PID: - result = audit_comparator(cb->creds.pid, f->op, f->val); - break; - case AUDIT_UID: - result = audit_comparator(cb->creds.uid, f->op, f->val); - break; - case AUDIT_GID: - result = audit_comparator(cb->creds.gid, f->op, f->val); - break; - case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, f->op, f->val); - break; - } - - if (!result) - return 0; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - 
} - return 1; -} - -int audit_filter_user(struct netlink_skb_parms *cb) -{ - enum audit_state state = AUDIT_DISABLED; - struct audit_entry *e; - int ret = 1; - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(cb, &e->rule, &state)) { - if (state == AUDIT_DISABLED) - ret = 0; - break; - } - } - rcu_read_unlock(); - - return ret; /* Audit by default */ -} - -int audit_filter_type(int type) -{ - struct audit_entry *e; - int result = 0; - - rcu_read_lock(); - if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) - goto unlock_and_return; - - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], - list) { - int i; - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_MSGTYPE) { - result = audit_comparator(type, f->op, f->val); - if (!result) - break; - } - } - if (result) - goto unlock_and_return; - } -unlock_and_return: - rcu_read_unlock(); - return result; -} - -/* This function will re-initialize the lsm_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain LSM - * specific filter fields. When such a rule is found, it is copied, the - * LSM field is re-initialized, and the old rule is replaced with the - * updated rule. 
*/ -int audit_update_lsm_rules(void) -{ - struct audit_entry *entry, *n, *nentry; - struct audit_watch *watch; - struct audit_tree *tree; - int i, err = 0; - - /* audit_filter_mutex synchronizes the writers */ - mutex_lock(&audit_filter_mutex); - - for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!security_audit_rule_known(&entry->rule)) - continue; - - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); - if (IS_ERR(nentry)) { - /* save the first error encountered for the - * return value */ - if (!err) - err = PTR_ERR(nentry); - audit_panic("error updating LSM filters"); - if (watch) - list_del(&entry->rule.rlist); - list_del_rcu(&entry->list); - } else { - if (watch) { - list_add(&nentry->rule.rlist, - &watch->rules); - list_del(&entry->rule.rlist); - } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); - list_replace_rcu(&entry->list, &nentry->list); - } - call_rcu(&entry->rcu, audit_free_rule_rcu); - } - } - - mutex_unlock(&audit_filter_mutex); - - return err; -} - -/* Update watch data in audit rules based on inotify events. 
*/ -void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) -{ - struct audit_parent *parent; - - parent = container_of(i_watch, struct audit_parent, wdata); - - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { - audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); -} -/* auditsc.c -- System-call auditing support - * Handles all system-call specific auditing features. - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright (C) 2005, 2006 IBM Corporation - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Written by Rickard E. 
(Rik) Faith - * - * Many of the ideas implemented here are from Stephen C. Tweedie, - * especially the idea of avoiding a copy by using getname. - * - * The method for actual interception of syscall entry and exit (not in - * this file -- see entry.S) is based on a GPL'd patch written by - * okir@suse.de and Copyright 2003 SuSE Linux AG. - * - * POSIX message queue support added by George Wilson , - * 2006. - * - * The support of additional filter rules compares (>, <, >=, <=) was - * added by Dustin Kirkland , 2005. - * - * Modified by Amy Griffis to collect additional - * filesystem information. - * - * Subject and object context labeling support added by - * and for LSPP certification compliance. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "audit.h" - -/* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 - -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 - -/* no execve audit message should be longer than this (userspace limits) */ -#define MAX_EXECVE_AUDIT_LEN 7500 - -/* number of audit rules */ -int audit_n_rules; - -/* determines whether we collect data for signals sent */ -int audit_signals; - -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). - * - * Further, in fs/namei.c:path_lookup() we store the inode and device. 
*/ -struct audit_names { - const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ - unsigned long ino; - dev_t dev; - umode_t mode; - uid_t uid; - gid_t gid; - dev_t rdev; - u32 osid; -}; - -struct audit_aux_data { - struct audit_aux_data *next; - int type; -}; - -#define AUDIT_AUX_IPCPERM 0 - -/* Number of target pids per aux struct. */ -#define AUDIT_AUX_PIDS 16 - -struct audit_aux_data_mq_open { - struct audit_aux_data d; - int oflag; - mode_t mode; - struct mq_attr attr; -}; - -struct audit_aux_data_mq_sendrecv { - struct audit_aux_data d; - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; -}; - -struct audit_aux_data_mq_notify { - struct audit_aux_data d; - mqd_t mqdes; - struct sigevent notification; -}; - -struct audit_aux_data_mq_getsetattr { - struct audit_aux_data d; - mqd_t mqdes; - struct mq_attr mqstat; -}; - -struct audit_aux_data_ipcctl { - struct audit_aux_data d; - struct ipc_perm p; - unsigned long qbytes; - uid_t uid; - gid_t gid; - mode_t mode; - u32 osid; -}; - -struct audit_aux_data_execve { - struct audit_aux_data d; - int argc; - int envc; - struct mm_struct *mm; -}; - -struct audit_aux_data_socketcall { - struct audit_aux_data d; - int nargs; - unsigned long args[0]; -}; - -struct audit_aux_data_sockaddr { - struct audit_aux_data d; - int len; - char a[0]; -}; - -struct audit_aux_data_fd_pair { - struct audit_aux_data d; - int fd[2]; -}; - -struct audit_aux_data_pids { - struct audit_aux_data d; - pid_t target_pid[AUDIT_AUX_PIDS]; - uid_t target_auid[AUDIT_AUX_PIDS]; - uid_t target_uid[AUDIT_AUX_PIDS]; - unsigned int target_sessionid[AUDIT_AUX_PIDS]; - u32 target_sid[AUDIT_AUX_PIDS]; - char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; - int pid_count; -}; - -struct audit_tree_refs { - struct audit_tree_refs *next; - struct audit_chunk *c[31]; -}; - -/* The per-task audit context. 
*/ -struct audit_context { - int dummy; /* must be the first element */ - int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state; - unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ - int major; /* syscall number */ - unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ - long return_code;/* syscall return code */ - int auditable; /* 1 if record should be written */ - int name_count; - struct audit_names names[AUDIT_NAMES]; - char * filterkey; /* key for rule that triggered record */ - struct path pwd; - struct audit_context *previous; /* For nested syscalls */ - struct audit_aux_data *aux; - struct audit_aux_data *aux_pids; - - /* Save things to print about task_struct */ - pid_t pid, ppid; - uid_t uid, euid, suid, fsuid; - gid_t gid, egid, sgid, fsgid; - unsigned long personality; - int arch; - - pid_t target_pid; - uid_t target_auid; - uid_t target_uid; - unsigned int target_sessionid; - u32 target_sid; - char target_comm[TASK_COMM_LEN]; - - struct audit_tree_refs *trees, *first_trees; - int tree_count; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif -}; - -#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -static inline int open_arg(int flags, int mask) -{ - int n = ACC_MODE(flags); - if (flags & (O_TRUNC | O_CREAT)) - n |= AUDIT_PERM_WRITE; - return n & mask; -} - -static int audit_match_perm(struct audit_context *ctx, int mask) -{ - unsigned n; - if (unlikely(!ctx)) - return 0; - - n = ctx->major; - switch (audit_classify_syscall(ctx->arch, n)) { - case 0: /* native */ - if ((mask & AUDIT_PERM_WRITE) && - audit_match_class(AUDIT_CLASS_WRITE, n)) - return 1; - if ((mask & AUDIT_PERM_READ) && - audit_match_class(AUDIT_CLASS_READ, n)) - return 1; - if ((mask & AUDIT_PERM_ATTR) && - audit_match_class(AUDIT_CLASS_CHATTR, n)) - return 1; - return 0; - case 1: /* 32bit on biarch */ - if ((mask & AUDIT_PERM_WRITE) && - 
audit_match_class(AUDIT_CLASS_WRITE_32, n)) - return 1; - if ((mask & AUDIT_PERM_READ) && - audit_match_class(AUDIT_CLASS_READ_32, n)) - return 1; - if ((mask & AUDIT_PERM_ATTR) && - audit_match_class(AUDIT_CLASS_CHATTR_32, n)) - return 1; - return 0; - case 2: /* open */ - return mask & ACC_MODE(ctx->argv[1]); - case 3: /* openat */ - return mask & ACC_MODE(ctx->argv[2]); - case 4: /* socketcall */ - return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); - case 5: /* execve */ - return mask & AUDIT_PERM_EXEC; - default: - return 0; - } -} - -static int audit_match_filetype(struct audit_context *ctx, int which) -{ - unsigned index = which & ~S_IFMT; - mode_t mode = which & S_IFMT; - - if (unlikely(!ctx)) - return 0; - - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; -} - -/* - * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; - * ->first_trees points to its beginning, ->trees - to the current end of data. - * ->tree_count is the number of free entries in array pointed to by ->trees. - * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL, - * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously, - * it's going to remain 1-element for almost any setup) until we free context itself. - * References in it _are_ dropped - at the same time we free/drop aux stuff. 
- */ - -#ifdef CONFIG_AUDIT_TREE -static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) -{ - struct audit_tree_refs *p = ctx->trees; - int left = ctx->tree_count; - if (likely(left)) { - p->c[--left] = chunk; - ctx->tree_count = left; - return 1; - } - if (!p) - return 0; - p = p->next; - if (p) { - p->c[30] = chunk; - ctx->trees = p; - ctx->tree_count = 30; - return 1; - } - return 0; -} - -static int grow_tree_refs(struct audit_context *ctx) -{ - struct audit_tree_refs *p = ctx->trees; - ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL); - if (!ctx->trees) { - ctx->trees = p; - return 0; - } - if (p) - p->next = ctx->trees; - else - ctx->first_trees = ctx->trees; - ctx->tree_count = 31; - return 1; -} -#endif - -static void unroll_tree_refs(struct audit_context *ctx, - struct audit_tree_refs *p, int count) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_tree_refs *q; - int n; - if (!p) { - /* we started with empty chain */ - p = ctx->first_trees; - count = 31; - /* if the very first allocation has failed, nothing to do */ - if (!p) - return; - } - n = count; - for (q = p; q != ctx->trees; q = q->next, n = 31) { - while (n--) { - audit_put_chunk(q->c[n]); - q->c[n] = NULL; - } - } - while (n-- > ctx->tree_count) { - audit_put_chunk(q->c[n]); - q->c[n] = NULL; - } - ctx->trees = p; - ctx->tree_count = count; -#endif -} - -static void free_tree_refs(struct audit_context *ctx) -{ - struct audit_tree_refs *p, *q; - for (p = ctx->first_trees; p; p = q) { - q = p->next; - kfree(p); - } -} - -static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_tree_refs *p; - int n; - if (!tree) - return 0; - /* full ones */ - for (p = ctx->first_trees; p != ctx->trees; p = p->next) { - for (n = 0; n < 31; n++) - if (audit_tree_match(p->c[n], tree)) - return 1; - } - /* partial */ - if (p) { - for (n = ctx->tree_count; n < 31; n++) - if (audit_tree_match(p->c[n], tree)) - return 1; - 
} -#endif - return 0; -} - -/* Determine if any context name data matches a rule's watch data */ -/* Compare a task_struct with an audit_rule. Return 1 on match, 0 - * otherwise. */ -static int audit_filter_rules(struct task_struct *tsk, - struct audit_krule *rule, - struct audit_context *ctx, - struct audit_names *name, - enum audit_state *state) -{ - int i, j, need_sid = 1; - u32 sid; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - int result = 0; - - switch (f->type) { - case AUDIT_PID: - result = audit_comparator(tsk->pid, f->op, f->val); - break; - case AUDIT_PPID: - if (ctx) { - if (!ctx->ppid) - ctx->ppid = sys_getppid(); - result = audit_comparator(ctx->ppid, f->op, f->val); - } - break; - case AUDIT_UID: - result = audit_comparator(tsk->uid, f->op, f->val); - break; - case AUDIT_EUID: - result = audit_comparator(tsk->euid, f->op, f->val); - break; - case AUDIT_SUID: - result = audit_comparator(tsk->suid, f->op, f->val); - break; - case AUDIT_FSUID: - result = audit_comparator(tsk->fsuid, f->op, f->val); - break; - case AUDIT_GID: - result = audit_comparator(tsk->gid, f->op, f->val); - break; - case AUDIT_EGID: - result = audit_comparator(tsk->egid, f->op, f->val); - break; - case AUDIT_SGID: - result = audit_comparator(tsk->sgid, f->op, f->val); - break; - case AUDIT_FSGID: - result = audit_comparator(tsk->fsgid, f->op, f->val); - break; - case AUDIT_PERS: - result = audit_comparator(tsk->personality, f->op, f->val); - break; - case AUDIT_ARCH: - if (ctx) - result = audit_comparator(ctx->arch, f->op, f->val); - break; - - case AUDIT_EXIT: - if (ctx && ctx->return_valid) - result = audit_comparator(ctx->return_code, f->op, f->val); - break; - case AUDIT_SUCCESS: - if (ctx && ctx->return_valid) { - if (f->val) - result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); - else - result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); - } - break; - case AUDIT_DEVMAJOR: - if (name) - result 
= audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_INODE: - if (name) - result = (name->ino == f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_WATCH: - if (name && rule->watch->ino != (unsigned long)-1) - result = (name->dev == rule->watch->dev && - name->ino == rule->watch->ino); - break; - case AUDIT_DIR: - if (ctx) - result = match_tree_refs(ctx, rule->tree); - break; - case AUDIT_LOGINUID: - result = 0; - if (ctx) - result = audit_comparator(tsk->loginuid, f->op, f->val); - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - /* NOTE: this may return negative values indicating - a temporary error. We simply treat this as a - match for now to avoid losing information that - may be wanted. 
An error message will also be - logged upon error */ - if (f->lsm_rule) { - if (need_sid) { - security_task_getsecid(tsk, &sid); - need_sid = 0; - } - result = security_audit_rule_match(sid, f->type, - f->op, - f->lsm_rule, - ctx); - } - break; - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR - also applies here */ - if (f->lsm_rule) { - /* Find files that match */ - if (name) { - result = security_audit_rule_match( - name->osid, f->type, f->op, - f->lsm_rule, ctx); - } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { - ++result; - break; - } - } - } - /* Find ipc objects that match */ - if (ctx) { - struct audit_aux_data *aux; - for (aux = ctx->aux; aux; - aux = aux->next) { - if (aux->type == AUDIT_IPC) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { - ++result; - break; - } - } - } - } - } - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - if (ctx) - result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); - break; - case AUDIT_FILTERKEY: - /* ignore this field for filtering */ - result = 1; - break; - case AUDIT_PERM: - result = audit_match_perm(ctx, f->val); - break; - case AUDIT_FILETYPE: - result = audit_match_filetype(ctx, f->val); - break; - } - - if (!result) - return 0; - } - if (rule->filterkey && ctx) - ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -/* At process creation time, we can determine if system-call auditing is - * completely disabled for this task. 
Since we only have the task - * structure at this point, we can only check uid and gid. - */ -static enum audit_state audit_filter_task(struct task_struct *tsk) -{ - struct audit_entry *e; - enum audit_state state; - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { - rcu_read_unlock(); - return state; - } - } - rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; -} - -/* At syscall entry and exit time, this filter is called if the - * audit_state is not low enough that auditing cannot take place, but is - * also not high enough that we already know we have to write an audit - * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). - */ -static enum audit_state audit_filter_syscall(struct task_struct *tsk, - struct audit_context *ctx, - struct list_head *list) -{ - struct audit_entry *e; - enum audit_state state; - - if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; - - rcu_read_lock(); - if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state)) { - rcu_read_unlock(); - return state; - } - } - } - rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; -} - -/* At syscall exit time, this filter is called if any audit_names[] have been - * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. - * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
- */ -enum audit_state audit_filter_inodes(struct task_struct *tsk, - struct audit_context *ctx) -{ - int i; - struct audit_entry *e; - enum audit_state state; - - if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; - - rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - - if (list_empty(list)) - continue; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { - rcu_read_unlock(); - return state; - } - } - } - rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; -} - -void audit_set_auditable(struct audit_context *ctx) -{ - ctx->auditable = 1; -} - -static inline struct audit_context *audit_get_context(struct task_struct *tsk, - int return_valid, - int return_code) -{ - struct audit_context *context = tsk->audit_context; - - if (likely(!context)) - return NULL; - context->return_valid = return_valid; - - /* - * we need to fix up the return code in the audit logs if the actual - * return codes are later going to be fixed up by the arch specific - * signal handlers - * - * This is actually a test for: - * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || - * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) - * - * but is faster than a bunch of || - */ - if (unlikely(return_code <= -ERESTARTSYS) && - (return_code >= -ERESTART_RESTARTBLOCK) && - (return_code != -ENOIOCTLCMD)) - context->return_code = -EINTR; - else - context->return_code = return_code; - - if (context->in_syscall && !context->dummy && !context->auditable) { - enum audit_state state; - - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - if (state == AUDIT_RECORD_CONTEXT) { - context->auditable = 1; - goto get_context; - } - - state = audit_filter_inodes(tsk, 
context); - if (state == AUDIT_RECORD_CONTEXT) - context->auditable = 1; - - } - -get_context: - - tsk->audit_context = NULL; - return context; -} - -static inline void audit_free_names(struct audit_context *context) -{ - int i; - -#if AUDIT_DEBUG == 2 - if (context->auditable - ||context->put_count + context->ino_count != context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d" - " ino_count=%d [NOT freeing]\n", - __FILE__, __LINE__, - context->serial, context->major, context->in_syscall, - context->name_count, context->put_count, - context->ino_count); - for (i = 0; i < context->name_count; i++) { - printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } - dump_stack(); - return; - } -#endif -#if AUDIT_DEBUG - context->put_count = 0; - context->ino_count = 0; -#endif - - for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); - } - context->name_count = 0; - path_put(&context->pwd); - context->pwd.dentry = NULL; - context->pwd.mnt = NULL; -} - -static inline void audit_free_aux(struct audit_context *context) -{ - struct audit_aux_data *aux; - - while ((aux = context->aux)) { - context->aux = aux->next; - kfree(aux); - } - while ((aux = context->aux_pids)) { - context->aux_pids = aux->next; - kfree(aux); - } -} - -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; -} - -static inline struct audit_context *audit_alloc_context(enum audit_state state) -{ - struct audit_context *context; - - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) - return NULL; - audit_zero_context(context, state); - return context; -} - -/** - * audit_alloc - allocate an audit context block for a task - * @tsk: task - * - * Filter on the task information and allocate a per-task 
audit context - * if necessary. Doing so turns on system call auditing for the - * specified task. This is called from copy_process, so no lock is - * needed. - */ -int audit_alloc(struct task_struct *tsk) -{ - struct audit_context *context; - enum audit_state state; - - if (likely(!audit_ever_enabled)) - return 0; /* Return if not auditing. */ - - state = audit_filter_task(tsk); - if (likely(state == AUDIT_DISABLED)) - return 0; - - if (!(context = audit_alloc_context(state))) { - audit_log_lost("out of memory in audit_alloc"); - return -ENOMEM; - } - - tsk->audit_context = context; - set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); - return 0; -} - -static inline void audit_free_context(struct audit_context *context) -{ - struct audit_context *previous; - int count = 0; - - do { - previous = context->previous; - if (previous || (count && count < 10)) { - ++count; - printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" - " freeing multiple contexts (%d)\n", - context->serial, context->major, - context->name_count, count); - } - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - free_tree_refs(context); - audit_free_aux(context); - kfree(context->filterkey); - kfree(context); - context = previous; - } while (context); - if (count >= 10) - printk(KERN_ERR "audit: freed %d contexts\n", count); -} - -void audit_log_task_context(struct audit_buffer *ab) -{ - char *ctx = NULL; - unsigned len; - int error; - u32 sid; - - security_task_getsecid(current, &sid); - if (!sid) - return; - - error = security_secid_to_secctx(sid, &ctx, &len); - if (error) { - if (error != -EINVAL) - goto error_path; - return; - } - - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - return; - -error_path: - audit_panic("error in audit_log_task_context"); - return; -} - -EXPORT_SYMBOL(audit_log_task_context); - -static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) -{ - char name[sizeof(tsk->comm)]; - struct mm_struct *mm = 
tsk->mm; - struct vm_area_struct *vma; - - /* tsk == current */ - - get_task_comm(name, tsk); - audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); - - if (mm) { - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, "exe=", - &vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - } - audit_log_task_context(ab); -} - -static int audit_log_pid_context(struct audit_context *context, pid_t pid, - uid_t auid, uid_t uid, unsigned int sessionid, - u32 sid, char *comm) -{ - struct audit_buffer *ab; - char *ctx = NULL; - u32 len; - int rc = 0; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); - if (!ab) - return rc; - - audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, - uid, sessionid); - if (security_secid_to_secctx(sid, &ctx, &len)) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - audit_log_format(ab, " ocomm="); - audit_log_untrustedstring(ab, comm); - audit_log_end(ab); - - return rc; -} - -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundry) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. 
- */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 3 is the length of a=\n */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. 
- */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, "a%d_len=%zu ", arg_num, - has_cntl ? 
2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, "a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_format(*ab, "\"%s\"", buf); - audit_log_format(*ab, "\n"); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - -static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab, - struct audit_aux_data_execve *axi) -{ - int i; - size_t len, len_sent = 0; - const char __user *p; - char *buf; - - if (axi->mm != current->mm) - return; /* execve failed, no additional info */ - - p = (const char __user *)axi->mm->arg_start; - - audit_log_format(*ab, "argc=%d ", axi->argc); - - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. 
- */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { - audit_panic("out of memory for argv string\n"); - return; - } - - for (i = 0; i < axi->argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); -} - -static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) -{ - int i, call_panic = 0; - struct audit_buffer *ab; - struct audit_aux_data *aux; - const char *tty; - - /* tsk == current */ - context->pid = tsk->pid; - if (!context->ppid) - context->ppid = sys_getppid(); - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; - context->personality = tsk->personality; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); - if (!ab) - return; /* audit_panic has been called */ - audit_log_format(ab, "arch=%x syscall=%d", - context->arch, context->major); - if (context->personality != PER_LINUX) - audit_log_format(ab, " per=%lx", context->personality); - if (context->return_valid) - audit_log_format(ab, " success=%s exit=%ld", - (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", - context->return_code); - - mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - read_unlock(&tasklist_lock); - audit_log_format(ab, - " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " ppid=%d pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - context->argv[0], - context->argv[1], - context->argv[2], - context->argv[3], - context->name_count, - context->ppid, - context->pid, - tsk->loginuid, - context->uid, - context->gid, - context->euid, context->suid, context->fsuid, - context->egid, context->sgid, 
context->fsgid, tty, - tsk->sessionid); - - mutex_unlock(&tty_mutex); - - audit_log_task_info(ab, tsk); - if (context->filterkey) { - audit_log_format(ab, " key="); - audit_log_untrustedstring(ab, context->filterkey); - } else - audit_log_format(ab, " key=(null)"); - audit_log_end(ab); - - for (aux = context->aux; aux; aux = aux->next) { - - ab = audit_log_start(context, GFP_KERNEL, aux->type); - if (!ab) - continue; /* audit_panic has been called */ - - switch (aux->type) { - case AUDIT_MQ_OPEN: { - struct audit_aux_data_mq_open *axi = (void *)aux; - audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " - "mq_msgsize=%ld mq_curmsgs=%ld", - axi->oflag, axi->mode, axi->attr.mq_flags, - axi->attr.mq_maxmsg, axi->attr.mq_msgsize, - axi->attr.mq_curmsgs); - break; } - - case AUDIT_MQ_SENDRECV: { - struct audit_aux_data_mq_sendrecv *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", - axi->mqdes, axi->msg_len, axi->msg_prio, - axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); - break; } - - case AUDIT_MQ_NOTIFY: { - struct audit_aux_data_mq_notify *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d sigev_signo=%d", - axi->mqdes, - axi->notification.sigev_signo); - break; } - - case AUDIT_MQ_GETSETATTR: { - struct audit_aux_data_mq_getsetattr *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " - "mq_curmsgs=%ld ", - axi->mqdes, - axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, - axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); - break; } - - case AUDIT_IPC: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "ouid=%u ogid=%u mode=%#o", - axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - 
security_release_secctx(ctx, len); - } - } - break; } - - case AUDIT_IPC_SET_PERM: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", - axi->qbytes, axi->uid, axi->gid, axi->mode); - break; } - - case AUDIT_EXECVE: { - struct audit_aux_data_execve *axi = (void *)aux; - audit_log_execve_info(context, &ab, axi); - break; } - - case AUDIT_SOCKETCALL: { - struct audit_aux_data_socketcall *axs = (void *)aux; - audit_log_format(ab, "nargs=%d", axs->nargs); - for (i=0; inargs; i++) - audit_log_format(ab, " a%d=%lx", i, axs->args[i]); - break; } - - case AUDIT_SOCKADDR: { - struct audit_aux_data_sockaddr *axs = (void *)aux; - - audit_log_format(ab, "saddr="); - audit_log_n_hex(ab, axs->a, axs->len); - break; } - - case AUDIT_FD_PAIR: { - struct audit_aux_data_fd_pair *axs = (void *)aux; - audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); - break; } - - } - audit_log_end(ab); - } - - for (aux = context->aux_pids; aux; aux = aux->next) { - struct audit_aux_data_pids *axs = (void *)aux; - - for (i = 0; i < axs->pid_count; i++) - if (audit_log_pid_context(context, axs->target_pid[i], - axs->target_auid[i], - axs->target_uid[i], - axs->target_sessionid[i], - axs->target_sid[i], - axs->target_comm[i])) - call_panic = 1; - } - - if (context->target_pid && - audit_log_pid_context(context, context->target_pid, - context->target_auid, context->target_uid, - context->target_sessionid, - context->target_sid, context->target_comm)) - call_panic = 1; - - if (context->pwd.dentry && context->pwd.mnt) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); - if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); - audit_log_end(ab); - } - } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - 
switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_end(ab); - } - - /* Send end of event record to help user space know we are finished */ - ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); - if (ab) - audit_log_end(ab); - if (call_panic) - audit_panic("error converting sid to string"); -} - -/** - * audit_free - free a per-task audit context - * @tsk: task whose audit context block to free - * - * Called from copy_process and do_exit - */ -void audit_free(struct task_struct *tsk) -{ - struct audit_context *context; - - context = audit_get_context(tsk, 0, 0); - if (likely(!context)) - return; - - /* Check for system calls that do not go through the exit - * function (e.g., exit_group), then free context block. 
- * We use GFP_ATOMIC here because we might be doing this - * in the context of the idle thread */ - /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->auditable) - audit_log_exit(context, tsk); - - audit_free_context(context); -} - -/** - * audit_syscall_entry - fill in an audit record at syscall entry - * @tsk: task being audited - * @arch: architecture type - * @major: major syscall type (function) - * @a1: additional syscall register 1 - * @a2: additional syscall register 2 - * @a3: additional syscall register 3 - * @a4: additional syscall register 4 - * - * Fill in audit context at syscall entry. This only happens if the - * audit context was created when the task was created and the state or - * filters demand the audit context be built. If the state from the - * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, - * then the record will be written at syscall exit time (otherwise, it - * will only be written if another part of the kernel requests that it - * be written). - */ -void audit_syscall_entry(int arch, int major, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4) -{ - struct task_struct *tsk = current; - struct audit_context *context = tsk->audit_context; - enum audit_state state; - - if (unlikely(!context)) - return; - - /* - * This happens only on certain architectures that make system - * calls in kernel_thread via the entry.S interface, instead of - * with direct calls. (If you are porting to a new - * architecture, hitting this condition can indicate that you - * got the _exit/_leave calls backward in entry.S.) - * - * i386 no - * x86_64 no - * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) - * - * This also happens with vm86 emulation in a non-nested manner - * (entries without exits), so this case must be caught. 
- */ - if (context->in_syscall) { - struct audit_context *newctx; - -#if AUDIT_DEBUG - printk(KERN_ERR - "audit(:%d) pid=%d in syscall=%d;" - " entering syscall=%d\n", - context->serial, tsk->pid, context->major, major); -#endif - newctx = audit_alloc_context(context->state); - if (newctx) { - newctx->previous = context; - context = newctx; - tsk->audit_context = newctx; - } else { - /* If we can't alloc a new context, the best we - * can do is to leak memory (any pending putname - * will be lost). The only other alternative is - * to abandon auditing. */ - audit_zero_context(context, context->state); - } - } - BUG_ON(context->in_syscall || context->name_count); - - if (!audit_enabled) - return; - - context->arch = arch; - context->major = major; - context->argv[0] = a1; - context->argv[1] = a2; - context->argv[2] = a3; - context->argv[3] = a4; - - state = context->state; - context->dummy = !audit_n_rules; - if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); - if (likely(state == AUDIT_DISABLED)) - return; - - context->serial = 0; - context->ctime = CURRENT_TIME; - context->in_syscall = 1; - context->auditable = !!(state == AUDIT_RECORD_CONTEXT); - context->ppid = 0; -} - -/** - * audit_syscall_exit - deallocate audit context after a system call - * @tsk: task being audited - * @valid: success/failure flag - * @return_code: syscall return value - * - * Tear down after system call. If the audit context has been marked as - * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel write an audit - * message), then write out the syscall information. In call cases, - * free the names stored from getname(). 
- */ -void audit_syscall_exit(int valid, long return_code) -{ - struct task_struct *tsk = current; - struct audit_context *context; - - context = audit_get_context(tsk, valid, return_code); - - if (likely(!context)) - return; - - if (context->in_syscall && context->auditable) - audit_log_exit(context, tsk); - - context->in_syscall = 0; - context->auditable = 0; - - if (context->previous) { - struct audit_context *new_context = context->previous; - context->previous = NULL; - audit_free_context(context); - tsk->audit_context = new_context; - } else { - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - audit_free_aux(context); - context->aux = NULL; - context->aux_pids = NULL; - context->target_pid = 0; - context->target_sid = 0; - kfree(context->filterkey); - context->filterkey = NULL; - tsk->audit_context = context; - } -} - -static inline void handle_one(const struct inode *inode) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_context *context; - struct audit_tree_refs *p; - struct audit_chunk *chunk; - int count; - if (likely(list_empty(&inode->inotify_watches))) - return; - context = current->audit_context; - p = context->trees; - count = context->tree_count; - rcu_read_lock(); - chunk = audit_tree_lookup(inode); - rcu_read_unlock(); - if (!chunk) - return; - if (likely(put_tree_ref(context, chunk))) - return; - if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); - audit_set_auditable(context); - audit_put_chunk(chunk); - unroll_tree_refs(context, p, count); - return; - } - put_tree_ref(context, chunk); -#endif -} - -static void handle_path(const struct dentry *dentry) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_context *context; - struct audit_tree_refs *p; - const struct dentry *d, *parent; - struct audit_chunk *drop; - unsigned long seq; - int count; - - context = current->audit_context; - p = context->trees; - count = context->tree_count; -retry: - drop = NULL; - d = dentry; - 
rcu_read_lock(); - seq = read_seqbegin(&rename_lock); - for(;;) { - struct inode *inode = d->d_inode; - if (inode && unlikely(!list_empty(&inode->inotify_watches))) { - struct audit_chunk *chunk; - chunk = audit_tree_lookup(inode); - if (chunk) { - if (unlikely(!put_tree_ref(context, chunk))) { - drop = chunk; - break; - } - } - } - parent = d->d_parent; - if (parent == d) - break; - d = parent; - } - if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */ - rcu_read_unlock(); - if (!drop) { - /* just a race with rename */ - unroll_tree_refs(context, p, count); - goto retry; - } - audit_put_chunk(drop); - if (grow_tree_refs(context)) { - /* OK, got more space */ - unroll_tree_refs(context, p, count); - goto retry; - } - /* too bad */ - printk(KERN_WARNING - "out of memory, audit has lost a tree reference\n"); - unroll_tree_refs(context, p, count); - audit_set_auditable(context); - return; - } - rcu_read_unlock(); -#endif -} - -/** - * audit_getname - add a name to the list - * @name: name to add - * - * Add a name to the list of audit names for this context. - * Called from fs/namei.c:getname(). 
- */ -void __audit_getname(const char *name) -{ - struct audit_context *context = current->audit_context; - - if (IS_ERR(name) || !name) - return; - - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", - __FILE__, __LINE__, context->serial, name); - dump_stack(); -#endif - return; - } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; - if (!context->pwd.dentry) { - read_lock(¤t->fs->lock); - context->pwd = current->fs->pwd; - path_get(¤t->fs->pwd); - read_unlock(¤t->fs->lock); - } - -} - -/* audit_putname - intercept a putname request - * @name: name to intercept and delay for putname - * - * If we have stored the name from getname in the audit context, - * then we delay the putname until syscall exit. - * Called from include/linux/fs.h:putname(). 
- */ -void audit_putname(const char *name) -{ - struct audit_context *context = current->audit_context; - - BUG_ON(!context); - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", - __FILE__, __LINE__, context->serial, name); - if (context->name_count) { - int i; - for (i = 0; i < context->name_count; i++) - printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } -#endif - __putname(name); - } -#if AUDIT_DEBUG - else { - ++context->put_count; - if (context->put_count > context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d" - " in_syscall=%d putname(%p) name_count=%d" - " put_count=%d\n", - __FILE__, __LINE__, - context->serial, context->major, - context->in_syscall, name, context->name_count, - context->put_count); - dump_stack(); - } - } -#endif -} - -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - -/* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); -} - -/** - * audit_inode - store the inode and device from a lookup - * @name: name being audited - * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). 
- */ -void __audit_inode(const char *name, const struct dentry *dentry) -{ - int idx; - struct audit_context *context = current->audit_context; - const struct inode *inode = dentry->d_inode; - - if (!context->in_syscall) - return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? */ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - } - handle_path(dentry); - audit_copy_inode(&context->names[idx], inode); -} - -/** - * audit_inode_child - collect inode info for created/removed objects - * @dname: inode's dentry name - * @dentry: dentry being audited - * @parent: inode of dentry parent - * - * For syscalls that create or remove filesystem objects, audit_inode - * can only collect information for the filesystem object's parent. - * This call updates the audit context with the child's information. - * Syscalls that create a new filesystem object must be hooked after - * the object is created. Syscalls that remove a filesystem object - * must be hooked prior, in order to capture the target inode during - * unsuccessful attempts. 
- */ -void __audit_inode_child(const char *dname, const struct dentry *dentry, - const struct inode *parent) -{ - int idx; - struct audit_context *context = current->audit_context; - const char *found_parent = NULL, *found_child = NULL; - const struct inode *inode = dentry->d_inode; - int dirlen = 0; - - if (!context->in_syscall) - return; - - if (inode) - handle_one(inode); - /* determine matching parent */ - if (!dname) - goto add_names; - - /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - - if (!n->name) - continue; - - if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, &dirlen)) { - n->name_len = dirlen; /* update parent data in place */ - found_parent = n->name; - goto add_names; - } - } - - /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - - if (!n->name) - continue; - - /* strcmp() is the more likely scenario */ - if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, &dirlen)) { - if (inode) - audit_copy_inode(n, inode); - else - n->ino = (unsigned long)-1; - found_child = n->name; - goto add_names; - } - } - -add_names: - if (!found_parent) { - if (audit_inc_name_count(context, parent)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], parent); - } - - if (!found_child) { - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - - /* Re-use the name belonging to the slot for a matching parent - * directory. 
All names for this context are relinquished in - * audit_free_names() */ - if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; - /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; - } - - if (inode) - audit_copy_inode(&context->names[idx], inode); - else - context->names[idx].ino = (unsigned long)-1; - } -} -EXPORT_SYMBOL_GPL(__audit_inode_child); - -/** - * auditsc_get_stamp - get local copies of audit_context values - * @ctx: audit_context for the task - * @t: timespec to store time recorded in the audit_context - * @serial: serial value that is recorded in the audit_context - * - * Also sets the context as auditable. - */ -void auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) -{ - if (!ctx->serial) - ctx->serial = audit_serial(); - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; - ctx->auditable = 1; -} - -/* global counter which is incremented every time something logs in */ -static atomic_t session_id = ATOMIC_INIT(0); - -/** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified - * @loginuid: loginuid value - * - * Returns 0. - * - * Called (set) from fs/proc/base.c::proc_loginuid_write(). 
- */ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) -{ - unsigned int sessionid = atomic_inc_return(&session_id); - struct audit_context *context = task->audit_context; - - if (context && context->in_syscall) { - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u" - " old ses=%u new ses=%u", - task->pid, task->uid, - task->loginuid, loginuid, - task->sessionid, sessionid); - audit_log_end(ab); - } - } - task->sessionid = sessionid; - task->loginuid = loginuid; - return 0; -} - -/** - * __audit_mq_open - record audit data for a POSIX MQ open - * @oflag: open flag - * @mode: mode bits - * @u_attr: queue attributes - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) -{ - struct audit_aux_data_mq_open *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_attr != NULL) { - if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->attr, 0, sizeof(ax->attr)); - - ax->oflag = oflag; - ax->mode = mode; - - ax->d.type = AUDIT_MQ_OPEN; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_timedsend - record audit data for a POSIX MQ timed send - * @mqdes: MQ descriptor - * @msg_len: Message length - * @msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time - * - * Returns 0 for success or NULL context or < 0 on error. 
- */ -int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec __user *u_abs_timeout) -{ - struct audit_aux_data_mq_sendrecv *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); - - ax->mqdes = mqdes; - ax->msg_len = msg_len; - ax->msg_prio = msg_prio; - - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive - * @mqdes: MQ descriptor - * @msg_len: Message length - * @u_msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time - * - * Returns 0 for success or NULL context or < 0 on error. 
- */ -int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, - unsigned int __user *u_msg_prio, - const struct timespec __user *u_abs_timeout) -{ - struct audit_aux_data_mq_sendrecv *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_msg_prio != NULL) { - if (get_user(ax->msg_prio, u_msg_prio)) { - kfree(ax); - return -EFAULT; - } - } else - ax->msg_prio = 0; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); - - ax->mqdes = mqdes; - ax->msg_len = msg_len; - - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_notify - record audit data for a POSIX MQ notify - * @mqdes: MQ descriptor - * @u_notification: Notification event - * - * Returns 0 for success or NULL context or < 0 on error. - */ - -int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) -{ - struct audit_aux_data_mq_notify *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_notification != NULL) { - if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->notification, 0, sizeof(ax->notification)); - - ax->mqdes = mqdes; - - ax->d.type = AUDIT_MQ_NOTIFY; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute - * @mqdes: MQ descriptor - * @mqstat: MQ flags - * - * Returns 0 for success or NULL context or < 0 on error. 
- */ -int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) -{ - struct audit_aux_data_mq_getsetattr *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->mqdes = mqdes; - ax->mqstat = *mqstat; - - ax->d.type = AUDIT_MQ_GETSETATTR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * audit_ipc_obj - record audit data for ipc object - * @ipcp: ipc permissions - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int __audit_ipc_obj(struct kern_ipc_perm *ipcp) -{ - struct audit_aux_data_ipcctl *ax; - struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->uid = ipcp->uid; - ax->gid = ipcp->gid; - ax->mode = ipcp->mode; - security_ipc_getsecid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * audit_ipc_set_perm - record audit data for new ipc permissions - * @qbytes: msgq bytes - * @uid: msgq user id - * @gid: msgq group id - * @mode: msgq mode (permissions) - * - * Returns 0 for success or NULL context or < 0 on error. 
- */ -int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) -{ - struct audit_aux_data_ipcctl *ax; - struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->qbytes = qbytes; - ax->uid = uid; - ax->gid = gid; - ax->mode = mode; - - ax->d.type = AUDIT_IPC_SET_PERM; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -int audit_bprm(struct linux_binprm *bprm) -{ - struct audit_aux_data_execve *ax; - struct audit_context *context = current->audit_context; - - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->argc = bprm->argc; - ax->envc = bprm->envc; - ax->mm = bprm->mm; - ax->d.type = AUDIT_EXECVE; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - - -/** - * audit_socketcall - record audit data for sys_socketcall - * @nargs: number of args - * @args: args array - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int audit_socketcall(int nargs, unsigned long *args) -{ - struct audit_aux_data_socketcall *ax; - struct audit_context *context = current->audit_context; - - if (likely(!context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->nargs = nargs; - memcpy(ax->args, args, nargs * sizeof(unsigned long)); - - ax->d.type = AUDIT_SOCKETCALL; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_fd_pair - record audit data for pipe and socketpair - * @fd1: the first file descriptor - * @fd2: the second file descriptor - * - * Returns 0 for success or NULL context or < 0 on error. 
- */ -int __audit_fd_pair(int fd1, int fd2) -{ - struct audit_context *context = current->audit_context; - struct audit_aux_data_fd_pair *ax; - - if (likely(!context)) { - return 0; - } - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) { - return -ENOMEM; - } - - ax->fd[0] = fd1; - ax->fd[1] = fd2; - - ax->d.type = AUDIT_FD_PAIR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto - * @len: data length in user space - * @a: data address in kernel space - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int audit_sockaddr(int len, void *a) -{ - struct audit_aux_data_sockaddr *ax; - struct audit_context *context = current->audit_context; - - if (likely(!context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->len = len; - memcpy(ax->a, a, len); - - ax->d.type = AUDIT_SOCKADDR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -void __audit_ptrace(struct task_struct *t) -{ - struct audit_context *context = current->audit_context; - - context->target_pid = t->pid; - context->target_auid = audit_get_loginuid(t); - context->target_uid = t->uid; - context->target_sessionid = audit_get_sessionid(t); - security_task_getsecid(t, &context->target_sid); - memcpy(context->target_comm, t->comm, TASK_COMM_LEN); -} - -/** - * audit_signal_info - record signal info for shutting down audit subsystem - * @sig: signal value - * @t: task being signaled - * - * If the audit subsystem is being terminated, record the task (pid) - * and uid that is doing that. 
- */ -int __audit_signal_info(int sig, struct task_struct *t) -{ - struct audit_aux_data_pids *axp; - struct task_struct *tsk = current; - struct audit_context *ctx = tsk->audit_context; - - if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = tsk->pid; - if (tsk->loginuid != -1) - audit_sig_uid = tsk->loginuid; - else - audit_sig_uid = tsk->uid; - security_task_getsecid(tsk, &audit_sig_sid); - } - if (!audit_signals || audit_dummy_context()) - return 0; - } - - /* optimize the common case by putting first signal recipient directly - * in audit_context */ - if (!ctx->target_pid) { - ctx->target_pid = t->tgid; - ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t->uid; - ctx->target_sessionid = audit_get_sessionid(t); - security_task_getsecid(t, &ctx->target_sid); - memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); - return 0; - } - - axp = (void *)ctx->aux_pids; - if (!axp || axp->pid_count == AUDIT_AUX_PIDS) { - axp = kzalloc(sizeof(*axp), GFP_ATOMIC); - if (!axp) - return -ENOMEM; - - axp->d.type = AUDIT_OBJ_PID; - axp->d.next = ctx->aux_pids; - ctx->aux_pids = (void *)axp; - } - BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); - - axp->target_pid[axp->pid_count] = t->tgid; - axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t->uid; - axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - security_task_getsecid(t, &axp->target_sid[axp->pid_count]); - memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); - axp->pid_count++; - - return 0; -} - -/** - * audit_core_dumps - record information about processes that end abnormally - * @signr: signal value - * - * If a process ends with a core dump, something fishy is going on and we - * should record the event for investigation. 
- */ -void audit_core_dumps(long signr) -{ - struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current); - unsigned int sessionid = audit_get_sessionid(current); - - if (!audit_enabled) - return; - - if (signr == SIGQUIT) /* don't care for those */ - return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, current->uid, current->gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; - - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); - audit_log_end(ab); -} -/* - * Simple stack backtrace regression test module - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ - -#include -#include -#include -#include -#include -#include - -static void backtrace_test_normal(void) -{ - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - - dump_stack(); -} - -static DECLARE_COMPLETION(backtrace_work); - -static void backtrace_test_irq_callback(unsigned long data) -{ - dump_stack(); - complete(&backtrace_work); -} - -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); - -static void backtrace_test_irq(void) -{ - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - - init_completion(&backtrace_work); - tasklet_schedule(&backtrace_tasklet); - wait_for_completion(&backtrace_work); -} - -#ifdef CONFIG_STACKTRACE -static void backtrace_test_saved(void) -{ - struct stack_trace trace; - unsigned long entries[8]; - - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - - trace.nr_entries = 0; - trace.max_entries = ARRAY_SIZE(entries); - trace.entries = entries; - trace.skip = 0; - - save_stack_trace(&trace); - print_stack_trace(&trace, 0); -} -#else -static void backtrace_test_saved(void) -{ - printk("Saved backtrace test skipped.\n"); -} -#endif - -static int backtrace_regression_test(void) -{ - printk("====[ backtrace testing ]===========\n"); - - backtrace_test_normal(); - backtrace_test_irq(); - backtrace_test_saved(); - - printk("====[ end of backtrace testing ]====\n"); - return 0; -} - -static void exitf(void) -{ -} - -module_init(backtrace_regression_test); -module_exit(exitf); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arjan van de Ven "); -/* - * Generate definitions needed by the preprocessor. - * This code generates raw asm output which is post-processed - * to extract and format the required data. 
- */ - -#define __GENERATING_BOUNDS_H -/* Include headers that define the enum constants of interest */ -#include -#include -#include - -void foo(void) -{ - /* The enum constants to put into include/linux/bounds.h */ - DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); - DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - /* End of constants */ -} -/* - * linux/kernel/capability.c - * - * Copyright (C) 1997 Andrew Main - * - * Integrated into 2.1.97+, Andrew G. Morgan - * 30 May 2002: Cleanup, Robert M. Love - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * This lock protects task->cap_* for all tasks including current. - * Locking rule: acquire this prior to tasklist_lock. - */ -static DEFINE_SPINLOCK(task_capability_lock); - -/* - * Leveraged for setting/resetting capabilities - */ - -const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; - -EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set); - -/* - * More recent versions of libcap are available from: - * - * http://www.kernel.org/pub/linux/libs/security/linux-privs/ - */ - -static void warn_legacy_capability_use(void) -{ - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } -} - -/* - * Version 2 capabilities worked fine, but the linux/capability.h file - * that accompanied their introduction encouraged their use without - * the necessary user-space source code changes. As such, we have - * created a version 3 with equivalent functionality to version 2, but - * with a header change to protect legacy source code from using - * version 2 when it wanted to use version 1. 
If your system has code - * that trips the following warning, it is using version 2 specific - * capabilities and may be doing so insecurely. - * - * The remedy is to either upgrade your version of libcap (to 2.10+, - * if the application is linked against it), or recompile your - * application with modern kernel headers and this warning will go - * away. - */ - -static void warn_deprecated_v2(void) -{ - static int warned; - - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } -} - -/* - * Version check. Return the number of u32s in each capability flag - * array, or a negative value on error. - */ -static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) -{ - __u32 version; - - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - *tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. 
- */ - case _LINUX_CAPABILITY_VERSION_3: - *tocopy = _LINUX_CAPABILITY_U32S_3; - break; - default: - if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } - - return 0; -} - -#ifndef CONFIG_SECURITY_FILE_CAPABILITIES - -/* - * Without filesystem capability support, we nominally support one process - * setting the capabilities of another - */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - struct task_struct *target; - int ret; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, pEp, pIp, pPp); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, permitted)) { - security_capset_set(target, effective, - inheritable, permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. 
- */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - do_each_thread(g, target) { - if (target == current - || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - if (!found) - ret = 0; - - return ret; -} - -/* - * Given the target pid does not refer to the current process we - * need more elaborate support... (This support is not present when - * filesystem capabilities are configured.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *target; - int ret; - - if (!capable(CAP_SETPCAP)) - return -EPERM; - - if (pid == -1) /* all procs other than current and init */ - return cap_set_all(effective, inheritable, permitted); - - else if (pid < 0) /* all procs in process group */ - return cap_set_pg(-pid, effective, inheritable, permitted); - - /* target != current */ - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else { - ret = security_capset_check(target, effective, inheritable, - permitted); - - /* having verified that the proposed changes are legal, - we now put them into effect. 
*/ - if (!ret) - security_capset_set(target, effective, inheritable, - permitted); - } - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - - return ret; -} - -#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ - -/* - * If we have configured with filesystem capability support, then the - * only thing that can change the capabilities of the current process - * is the current process. As such, we can't be in this code at the - * same time as we are in the process of setting capabilities in this - * process. The net result is that we can limit our use of locks to - * when we are reading the caps of another process. - */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - int ret; - - if (pid && (pid != task_pid_vnr(current))) { - struct task_struct *target; - - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else - ret = security_capget(target, pEp, pIp, pPp); - - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); - } else - ret = security_capget(current, pEp, pIp, pPp); - - return ret; -} - -/* - * With filesystem capability support configured, the kernel does not - * permit the changing of capabilities in one process by another - * process. (CAP_SETPCAP has much less broad semantics when configured - * this way.) - */ -static inline int do_sys_capset_other_tasks(pid_t pid, - kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - return -EPERM; -} - -#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ - -/* - * Atomically modify the effective capabilities returning the original - * value. No permission check is performed here - it is assumed that the - * caller is permitted to set the desired effective capabilities. 
- */ -kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) -{ - kernel_cap_t pE_old; - - spin_lock(&task_capability_lock); - - pE_old = current->cap_effective; - current->cap_effective = pE_new; - - spin_unlock(&task_capability_lock); - - return pE_old; -} - -EXPORT_SYMBOL(cap_set_effective); - -/** - * sys_capget - get the capabilities of a given process. - * @header: pointer to struct that contains capability version and - * target pid data - * @dataptr: pointer to struct that contains the effective, permitted, - * and inheritable capabilities that are returned - * - * Returns 0 on success and < 0 on error. - */ -SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) -{ - int ret = 0; - pid_t pid; - unsigned tocopy; - kernel_cap_t pE, pI, pP; - - ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; - - if (get_user(pid, &header->pid)) - return -EFAULT; - - if (pid < 0) - return -EINVAL; - - ret = cap_get_target_pid(pid, &pE, &pI, &pP); - - if (!ret) { - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i; - - for (i = 0; i < tocopy; i++) { - kdata[i].effective = pE.cap[i]; - kdata[i].permitted = pP.cap[i]; - kdata[i].inheritable = pI.cap[i]; - } - - /* - * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, - * we silently drop the upper capabilities here. This - * has the effect of making older libcap - * implementations implicitly drop upper capability - * bits when they perform a: capget/modify/capset - * sequence. - * - * This behavior is considered fail-safe - * behavior. Upgrading the application to a newer - * version of libcap will enable access to the newer - * capabilities. - * - * An alternative would be to return an error here - * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts - * before modification is attempted and the application - * fails. 
- */ - if (copy_to_user(dataptr, kdata, tocopy - * sizeof(struct __user_cap_data_struct))) { - return -EFAULT; - } - } - - return ret; -} - -/** - * sys_capset - set capabilities for a process or (*) a group of processes - * @header: pointer to struct that contains capability version and - * target pid data - * @data: pointer to struct that contains the effective, permitted, - * and inheritable capabilities - * - * Set capabilities for a given process, all processes, or all - * processes in a given process group. - * - * The restrictions on setting capabilities are specified as: - * - * [pid is for the 'target' task. 'current' is the calling task.] - * - * I: any raised capabilities must be a subset of the (old current) permitted - * P: any raised capabilities must be a subset of the (old current) permitted - * E: must be set to a subset of (new target) permitted - * - * Returns 0 on success and < 0 on error. - */ -SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) -{ - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy; - kernel_cap_t inheritable, permitted, effective; - int ret; - pid_t pid; - - ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; - - if (get_user(pid, &header->pid)) - return -EFAULT; - - if (copy_from_user(&kdata, data, tocopy - * sizeof(struct __user_cap_data_struct))) { - return -EFAULT; - } - - for (i = 0; i < tocopy; i++) { - effective.cap[i] = kdata[i].effective; - permitted.cap[i] = kdata[i].permitted; - inheritable.cap[i] = kdata[i].inheritable; - } - while (i < _KERNEL_CAPABILITY_U32S) { - effective.cap[i] = 0; - permitted.cap[i] = 0; - inheritable.cap[i] = 0; - i++; - } - - if (pid && (pid != task_pid_vnr(current))) - ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, - &permitted); - else { - /* - * This lock is required even when filesystem - * capability support is configured - it protects the - * sys_capget() call from returning incorrect 
data in - * the case that the targeted process is not the - * current one. - */ - spin_lock(&task_capability_lock); - - ret = security_capset_check(current, &effective, &inheritable, - &permitted); - /* - * Having verified that the proposed changes are - * legal, we now put them into effect. - */ - if (!ret) - security_capset_set(current, &effective, &inheritable, - &permitted); - spin_unlock(&task_capability_lock); - } - - - return ret; -} - -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -int capable(int cap) -{ - if (has_capability(current, cap)) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} -EXPORT_SYMBOL(capable); -/* - * Generic process-grouping system. - * - * Based originally on the cpuset system, extracted by Paul Menage - * Copyright (C) 2006 Google, Inc - * - * Copyright notices from the original cpuset code: - * -------------------------------------------------- - * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004-2006 Silicon Graphics, Inc. - * - * Portions derived from Patrick Mochel's sysfs code. - * sysfs is Copyright (c) 2001-3 Patrick Mochel - * - * 2003-10-10 Written by Simon Derr. - * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson. - * --------------------------------------------------- - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static DEFINE_MUTEX(cgroup_mutex); - -/* Generate an array of cgroup subsystem pointers */ -#define SUBSYS(_x) &_x ## _subsys, - -static struct cgroup_subsys *subsys[] = { -#include -}; - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_bits; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_bits; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the mounted hierarchies */ - struct list_head root_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; -}; - - -/* - * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the - * subsystems that are otherwise unattached - it never has more than a - * single cgroup, and all tasks are part of that cgroup. - */ -static struct cgroupfs_root rootnode; - -/* The list of hierarchy roots */ - -static LIST_HEAD(roots); -static int root_count; - -/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ -#define dummytop (&rootnode.top_cgroup) - -/* This flag indicates whether tasks in the fork and exit paths should - * check for fork/exit handlers to call. 
This avoids us having to do - * extra work in the fork/exit path if none of the subsystems need to - * be called. - */ -static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; - -/* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) -{ - return test_bit(CGRP_REMOVED, &cgrp->flags); -} - -/* bits in struct cgroupfs_root flags field */ -enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ -}; - -static int cgroup_is_releasable(const struct cgroup *cgrp) -{ - const int bits = - (1 << CGRP_RELEASABLE) | - (1 << CGRP_NOTIFY_ON_RELEASE); - return (cgrp->flags & bits) == bits; -} - -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - -/* - * for_each_subsys() allows you to iterate on each subsystem attached to - * an active hierarchy - */ -#define for_each_subsys(_root, _ss) \ -list_for_each_entry(_ss, &_root->subsys_list, sibling) - -/* for_each_root() allows you to iterate across the active hierarchies */ -#define for_each_root(_root) \ -list_for_each_entry(_root, &roots, root_list) - -/* the list of cgroups eligible for automatic release. 
Protected by - * release_list_lock */ -static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); -static void cgroup_release_agent(struct work_struct *work); -static DECLARE_WORK(release_agent_work, cgroup_release_agent); -static void check_for_release(struct cgroup *cgrp); - -/* Link structure for associating css_set objects with cgroups */ -struct cg_cgroup_link { - /* - * List running through cg_cgroup_links associated with a - * cgroup, anchored on cgroup->css_sets - */ - struct list_head cgrp_link_list; - /* - * List running through cg_cgroup_links pointing at a - * single css_set object, anchored on css_set->cg_links - */ - struct list_head cg_link_list; - struct css_set *cg; -}; - -/* The default css_set - used by init and its children prior to any - * hierarchies being mounted. It contains a pointer to the root state - * for each subsystem. Also used to anchor the list of css_sets. Not - * reference-counted, to improve performance when child cgroups - * haven't been created. - */ - -static struct css_set init_css_set; -static struct cg_cgroup_link init_css_set_link; - -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ -static DEFINE_RWLOCK(css_set_lock); -static int css_set_count; - -/* hash table for cgroup groups. 
This improves the performance to - * find an existing css_set */ -#define CSS_SET_HASH_BITS 7 -#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; - -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) -{ - int i; - int index; - unsigned long tmp = 0UL; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) - tmp += (unsigned long)css[i]; - tmp = (tmp >> 16) ^ tmp; - - index = hash_long(tmp, CSS_SET_HASH_BITS); - - return &css_set_table[index]; -} - -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ -static int use_task_css_set_links __read_mostly; - -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. 
- */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - write_lock(&css_set_lock); - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } - - write_unlock(&css_set_lock); -} - -static void __release_css_set(struct kref *k, int taskexit) -{ - int i; - struct css_set *cg = container_of(k, struct css_set, ref); - - unlink_css_set(cg); - - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; - if (atomic_dec_and_test(&cgrp->count) && - notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - } - rcu_read_unlock(); - kfree(cg); -} - -static void release_css_set(struct kref *k) -{ - __release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ - __release_css_set(k, 1); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cg) -{ - kref_get(&cg->ref); -} - -static inline void put_css_set(struct css_set *cg) -{ - kref_put(&cg->ref, release_css_set); -} - -static inline void put_css_set_taskexit(struct css_set *cg) -{ - kref_put(&cg->ref, release_css_set_taskexit); -} - -/* - * find_existing_css_set() is a helper for - * find_css_set(), and checks to see whether an existing - * css_set is suitable. 
- * - * oldcg: the cgroup group that we're using before the cgroup - * transition - * - * cgrp: the cgroup that we're moving into - * - * template: location in which to build the desired set of subsystem - * state objects for the new cgroup group - */ -static struct css_set *find_existing_css_set( - struct css_set *oldcg, - struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) -{ - int i; - struct cgroupfs_root *root = cgrp->root; - struct hlist_head *hhead; - struct hlist_node *node; - struct css_set *cg; - - /* Built the set of subsystem state objects that we want to - * see in the new css_set */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1UL << i)) { - /* Subsystem is in this hierarchy. So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgrp->subsys[i]; - } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ - template[i] = oldcg->subsys[i]; - } - } - - hhead = css_set_hash(template); - hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } - } - - /* No existing cgroup group matched */ - return NULL; -} - -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -/* - * allocate_cg_links() allocates "count" cg_cgroup_link structures - * and chains them on tmp through their cgrp_link_list fields. 
Returns 0 on - * success or a negative error - */ -static int allocate_cg_links(int count, struct list_head *tmp) -{ - struct cg_cgroup_link *link; - int i; - INIT_LIST_HEAD(tmp); - for (i = 0; i < count; i++) { - link = kmalloc(sizeof(*link), GFP_KERNEL); - if (!link) { - free_cg_links(tmp); - return -ENOMEM; - } - list_add(&link->cgrp_link_list, tmp); - } - return 0; -} - -/* - * find_css_set() takes an existing cgroup group and a - * cgroup object, and returns a css_set object that's - * equivalent to the old group, but with the given cgroup - * substituted into the appropriate hierarchy. Must be called with - * cgroup_mutex held - */ -static struct css_set *find_css_set( - struct css_set *oldcg, struct cgroup *cgrp) -{ - struct css_set *res; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; - - struct list_head tmp_cg_links; - struct cg_cgroup_link *link; - - struct hlist_head *hhead; - - /* First see if we already have a cgroup group that matches - * the desired set */ - read_lock(&css_set_lock); - res = find_existing_css_set(oldcg, cgrp, template); - if (res) - get_css_set(res); - read_unlock(&css_set_lock); - - if (res) - return res; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return NULL; - - /* Allocate all the cg_cgroup_link objects that we'll need */ - if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { - kfree(res); - return NULL; - } - - kref_init(&res->ref); - INIT_LIST_HEAD(&res->cg_links); - INIT_LIST_HEAD(&res->tasks); - INIT_HLIST_NODE(&res->hlist); - - /* Copy the set of subsystem state objects generated in - * find_existing_css_set() */ - memcpy(res->subsys, template, sizeof(res->subsys)); - - write_lock(&css_set_lock); - /* Add reference counts and links from the new css_set. 
*/ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) { - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &cgrp->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - } - if (list_empty(&rootnode.subsys_list)) { - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &dummytop->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - - BUG_ON(!list_empty(&tmp_cg_links)); - - css_set_count++; - - /* Add this cgroup group to the hash table */ - hhead = css_set_hash(res->subsys); - hlist_add_head(&res->hlist, hhead); - - write_unlock(&css_set_lock); - - return res; -} - -/* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * - * A task must hold cgroup_mutex to modify cgroups. - * - * Any task can increment and decrement the count field without lock. - * So in general, code holding cgroup_mutex can't rely on the count - * field not changing. However, if the count goes to zero, then only - * cgroup_attach_task() can increment it again. Because a count of zero - * means that no tasks are currently attached, therefore there is no - * way a task attached to that cgroup can fork (the other way to - * increment the count). So code holding cgroup_mutex can safely - * assume that if the count is zero, it will stay zero. 
Similarly, if - * a task holds cgroup_mutex on a cgroup with zero count, it - * knows that the cgroup won't be removed, as cgroup_rmdir() - * needs that mutex. - * - * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't - * (usually) take cgroup_mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cgroup_exit(), - * when a task in a notify_on_release cgroup exits. Then cgroup_mutex - * is taken, and if the cgroup count is zero, a usermode call made - * to the release agent with the name of the cgroup (path relative to - * the root of cgroup file system) as the argument. - * - * A cgroup can only be deleted if both its 'count' of using tasks - * is zero, and its list of 'children' cgroups is empty. Since all - * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup - * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. - * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. - * - * P.S. One more locking exception. 
RCU is used to guard the - * update of a tasks cgroup pointer by cgroup_attach_task() - */ - -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. - */ -void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} - -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. - */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; -static struct file_operations proc_cgroupstats_operations; - -static struct backing_dev_info cgroup_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - - if (inode) { - inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; -} - -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) - ss->pre_destroy(ss, cgrp); - return; -} - -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? 
if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - struct cgroup_subsys *ss; - BUG_ON(!(cgroup_is_removed(cgrp))); - /* It's possible for external users to be holding css - * reference counts on a cgroup; css_put() needs to - * be able to access the cgroup after decrementing - * the reference count in order to know if it needs to - * queue the cgroup to be handled by the release - * agent */ - synchronize_rcu(); - - mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } - - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* Drop the active superblock reference that we took when we - * created the cgroup */ - deactivate_super(cgrp->root->sb); - - kfree(cgrp); - } - iput(inode); -} - -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); - - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); -} - -static void cgroup_clear_directory(struct dentry *dentry) -{ - struct list_head *node; - - BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dcache_lock); - node = dentry->d_subdirs.next; - while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_u.d_child); - list_del_init(node); - if (d->d_inode) { - /* This should never be called on a cgroup - * directory with child cgroups */ - BUG_ON(d->d_inode->i_mode & S_IFDIR); - d = dget_locked(d); - spin_unlock(&dcache_lock); - d_delete(d); - simple_unlink(dentry->d_inode, d); - dput(d); - spin_lock(&dcache_lock); - } - node = dentry->d_subdirs.next; - } - spin_unlock(&dcache_lock); -} - -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - cgroup_clear_directory(dentry); - - spin_lock(&dcache_lock); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dcache_lock); - 
remove_dir(dentry); -} - -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_bits) -{ - unsigned long added_bits, removed_bits; - struct cgroup *cgrp = &root->top_cgroup; - int i; - - removed_bits = root->actual_subsys_bits & ~final_bits; - added_bits = final_bits & ~root->actual_subsys_bits; - /* Check that any added subsystems are currently free */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_bits)) - continue; - if (ss->root != &rootnode) { - /* Subsystem isn't free */ - return -EBUSY; - } - } - - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (!list_empty(&cgrp->children)) - return -EBUSY; - - /* Process each subsystem */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - unsigned long bit = 1UL << i; - if (bit & added_bits) { - /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgrp->subsys[i]); - BUG_ON(!dummytop->subsys[i]); - BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - cgrp->subsys[i] = dummytop->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; - list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); - if (ss->bind) - ss->bind(ss, cgrp); - - } else if (bit & removed_bits) { - /* We're removing this subsystem */ - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - if (ss->bind) - ss->bind(ss, dummytop); - dummytop->subsys[i]->cgroup = dummytop; - cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); - list_del(&ss->sibling); - } else if (bit & final_bits) { - /* Subsystem state should already exist */ - BUG_ON(!cgrp->subsys[i]); - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); - } - } - root->subsys_bits = 
root->actual_subsys_bits = final_bits; - synchronize_rcu(); - - return 0; -} - -static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) -{ - struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; - struct cgroup_subsys *ss; - - mutex_lock(&cgroup_mutex); - for_each_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) - seq_puts(seq, ",noprefix"); - if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); - mutex_unlock(&cgroup_mutex); - return 0; -} - -struct cgroup_sb_opts { - unsigned long subsys_bits; - unsigned long flags; - char *release_agent; -}; - -/* Convert a hierarchy specifier into a bitmask of subsystems and - * flags. */ -static int parse_cgroupfs_options(char *data, - struct cgroup_sb_opts *opts) -{ - char *token, *o = data ?: "all"; - - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; - - while ((token = strsep(&o, ",")) != NULL) { - if (!*token) - return -EINVAL; - if (!strcmp(token, "all")) { - /* Add all non-disabled subsystems */ - int i; - opts->subsys_bits = 0; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (!ss->disabled) - opts->subsys_bits |= 1ul << i; - } - } else if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); - } else if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; - } else { - struct cgroup_subsys *ss; - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - ss = subsys[i]; - if (!strcmp(token, ss->name)) { - if (!ss->disabled) - set_bit(i, &opts->subsys_bits); - break; - } - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - } - - /* We can't 
have an empty hierarchy */ - if (!opts->subsys_bits) - return -EINVAL; - - return 0; -} - -static int cgroup_remount(struct super_block *sb, int *flags, char *data) -{ - int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgroup_sb_opts opts; - - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* Don't allow flags to change at remount */ - if (opts.flags != root->flags) { - ret = -EINVAL; - goto out_unlock; - } - - ret = rebind_subsystems(root, opts.subsys_bits); - - /* (re)populate subsystem files */ - if (!ret) - cgroup_populate_dir(cgrp); - - if (opts.release_agent) - strcpy(root->release_agent_path, opts.release_agent); - out_unlock: - if (opts.release_agent) - kfree(opts.release_agent); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - return ret; -} - -static struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; - -static void init_cgroup_root(struct cgroupfs_root *root) -{ - struct cgroup *cgrp = &root->top_cgroup; - INIT_LIST_HEAD(&root->subsys_list); - INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; - cgrp->root = root; - cgrp->top_cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); -} - -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroupfs_root *new = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; - - /* Next check flags */ - if (new->flags != root->flags) - return 0; - - return 1; -} - -static int cgroup_set_super(struct super_block *sb, void *data) -{ 
- int ret; - struct cgroupfs_root *root = data; - - ret = set_anon_super(sb, NULL); - if (ret) - return ret; - - sb->s_fs_info = root; - root->sb = sb; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; - - return 0; -} - -static int cgroup_get_rootdir(struct super_block *sb) -{ - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; - - if (!inode) - return -ENOMEM; - - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); - return -ENOMEM; - } - sb->s_root = dentry; - return 0; -} - -static int cgroup_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) -{ - struct cgroup_sb_opts opts; - int ret = 0; - struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) { - if (opts.release_agent) - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - if (opts.release_agent) - kfree(opts.release_agent); - return -ENOMEM; - } - - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); - } - - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - - if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); - } - - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ - struct cgroup *cgrp = &root->top_cgroup; - struct inode *inode; - int i; - 
- BUG_ON(sb->s_root != NULL); - - ret = cgroup_get_rootdir(sb); - if (ret) - goto drop_new_super; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cg_links(css_set_count, &tmp_cg_links); - if (ret) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } - - ret = rebind_subsystems(root, root->subsys_bits); - if (ret == -EBUSY) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } - - /* EBUSY should be the only error here */ - BUG_ON(ret); - - list_add(&root->root_list, &roots); - root_count++; - - sb->s_root->d_fsdata = &root->top_cgroup; - root->top_cgroup.dentry = sb->s_root; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct hlist_head *hhead = &css_set_table[i]; - struct hlist_node *node; - struct css_set *cg; - - hlist_for_each_entry(cg, node, hhead, hlist) { - struct cg_cgroup_link *link; - - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - } - } - write_unlock(&css_set_lock); - - free_cg_links(&tmp_cg_links); - - BUG_ON(!list_empty(&cgrp->sibling)); - BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); - - cgroup_populate_dir(cgrp); - mutex_unlock(&inode->i_mutex); - mutex_unlock(&cgroup_mutex); - } - - return simple_set_mnt(mnt, sb); - - drop_new_super: - up_write(&sb->s_umount); - 
deactivate_super(sb); - free_cg_links(&tmp_cg_links); - return ret; -} - -static void cgroup_kill_sb(struct super_block *sb) { - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - int ret; - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(!list_empty(&cgrp->sibling)); - - mutex_lock(&cgroup_mutex); - - /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - - /* - * Release all the links from css_sets to this hierarchy's - * root cgroup - */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, - cgrp_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } - write_unlock(&css_set_lock); - - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } - mutex_unlock(&cgroup_mutex); - - kfree(root); - kill_litter_super(sb); -} - -static struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .get_sb = cgroup_get_sb, - .kill_sb = cgroup_kill_sb, -}; - -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -/** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. 
- */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - char *start; - - if (cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); - return 0; - } - - start = buf + buflen; - - *--start = '\0'; - for (;;) { - int len = cgrp->dentry->d_name.len; - if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, cgrp->dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; - if (!cgrp->parent) - continue; - if (--start < buf) - return -ENAMETOOLONG; - *start = '/'; - } - memmove(buf, start, buf + buflen - start); - return 0; -} - -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - -/** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call holding cgroup_mutex. May take task_lock of - * the task 'tsk' during call. 
- */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss; - struct cgroup *oldcgrp; - struct css_set *cg = tsk->cgroups; - struct css_set *newcg; - struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); - if (cgrp == oldcgrp) - return 0; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); - if (retval) - return retval; - } - } - - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - return -ESRCH; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } - write_unlock(&css_set_lock); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); - } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - synchronize_rcu(); - put_css_set(cg); - return 0; -} - -/* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. 
May take task_lock of task - */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) -{ - struct task_struct *tsk; - int ret; - - if (pid) { - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(tsk); - rcu_read_unlock(); - - if ((current->euid) && (current->euid != tsk->uid) - && (current->euid != tsk->suid)) { - put_task_struct(tsk); - return -EACCES; - } - } else { - tsk = current; - get_task_struct(tsk); - } - - ret = cgroup_attach_task(cgrp, tsk); - put_task_struct(tsk); - return ret; -} - -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) -{ - int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); - return ret; -} - -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. 
- */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} - -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) -{ - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - strcpy(cgrp->root->release_agent_path, buffer); - cgroup_unlock(); - return 0; -} - -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) -{ - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - seq_puts(seq, cgrp->root->release_agent_path); - seq_putc(seq, '\n'); - cgroup_unlock(); - return 0; -} - -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); - } else { - s64 val = simple_strtoll(buffer, &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); - } - if (!retval) - retval = nbytes; - return retval; -} - -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; - - if (!max_bytes) - max_bytes = 
sizeof(local_buffer) - 1; - if (nbytes >= max_bytes) - return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; - } - - buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; -} - -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - - if (!cft || cgroup_is_removed(cgrp)) - return -ENODEV; - if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); - return ret ? 
ret : nbytes; - } - return -EINVAL; -} - -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - - if (!cft || cgroup_is_removed(cgrp)) - return -ENODEV; - - if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); - if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); - return -EINVAL; -} - -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. 
- */ - -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) -{ - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); -} - -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(state->cgroup, cft, &cb); - } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); -} - -static struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = cgroup_seqfile_release, -}; - -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - int err; - struct cftype *cft; - - err = generic_file_open(inode, file); - if (err) - return err; - - cft = __d_cft(file->f_dentry); - if (!cft) - return -ENODEV; - if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state = - kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) - err = cft->open(inode, file); - else - err = 0; - - return err; -} - -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cftype *cft = __d_cft(file->f_dentry); - if (cft->release) - return cft->release(inode, file); - return 0; -} - -/* - * cgroup_rename - Only allow simple rename of directories in place. 
- */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) - return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) - return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); -} - -static struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, -}; - -static int cgroup_create_file(struct dentry *dentry, int mode, - struct super_block *sb) -{ - static struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - }; - - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - - /* start with the directory inode held, so that we can - * populate it without racing with another mkdir */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - } - dentry->d_op = &cgroup_dops; - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; -} - -/* - * cgroup_create_dir - create a directory for an object. - * @cgrp: the cgroup we create the directory for. It must have a valid - * ->parent field. And we are going to fill its ->dentry field. - * @dentry: dentry of the new cgroup - * @mode: mode to set on new directory. 
- */ -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - int mode) -{ - struct dentry *parent; - int error = 0; - - parent = cgrp->parent->dentry; - error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); - if (!error) { - dentry->d_fsdata = cgrp; - inc_nlink(parent->d_inode); - cgrp->dentry = dentry; - dget(dentry); - } - dput(dentry); - - return error; -} - -int cgroup_add_file(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype *cft) -{ - struct dentry *dir = cgrp->dentry; - struct dentry *dentry; - int error; - - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { - strcpy(name, subsys->name); - strcat(name, "."); - } - strcat(name, cft->name); - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - dentry = lookup_one_len(name, dir, strlen(name)); - if (!IS_ERR(dentry)) { - error = cgroup_create_file(dentry, 0644 | S_IFREG, - cgrp->root->sb); - if (!error) - dentry->d_fsdata = (void *)cft; - dput(dentry); - } else - error = PTR_ERR(dentry); - return error; -} - -int cgroup_add_files(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype cft[], - int count) -{ - int i, err; - for (i = 0; i < count; i++) { - err = cgroup_add_file(cgrp, subsys, &cft[i]); - if (err) - return err; - } - return 0; -} - -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cg_cgroup_link *link; - - read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->ref.refcount); - } - read_unlock(&css_set_lock); - return count; -} - -/* - * Advance a list_head iterator. 
The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) -{ - struct list_head *l = it->cg_link; - struct cg_cgroup_link *link; - struct css_set *cg; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->css_sets) { - it->cg_link = NULL; - return; - } - link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); - cg = link->cg; - } while (list_empty(&cg->tasks)); - it->cg_link = l; - it->task = cg->tasks.next; -} - -/* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - do_each_thread(g, p) { - task_lock(p); - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - */ - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &p->cgroups->tasks); - task_unlock(p); - } while_each_thread(g, p); - write_unlock(&css_set_lock); -} - -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) -{ - /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. 
- */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - - read_lock(&css_set_lock); - it->cg_link = &cgrp->css_sets; - cgroup_advance_iter(cgrp, it); -} - -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) -{ - struct task_struct *res; - struct list_head *l = it->task; - - /* If the iterator cg is NULL, we have no tasks */ - if (!it->cg_link) - return NULL; - res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ - l = l->next; - if (l == &res->cgroups->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); - } else { - it->task = l; - } - return res; -} - -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) -{ - read_unlock(&css_set_lock); -} - -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. 
- */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - -/** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan - * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. - * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. - * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). 
- */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) -{ - int retval, i; - struct cgroup_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct ptr_heap *heap; - struct timespec latest_time = { 0, 0 }; - - if (scan->heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } - - again: - /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. 
- */ - heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (scan->test_task && !scan->test_task(p, scan)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - cgroup_iter_end(scan->cg, &it); - - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - scan->process_task(q, scan); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} - -/* - * Stuff for reading the 'tasks' file. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here). The struct - * ctr_struct * is stored in file->private_data. 
Its resources will - * be freed by release() when the file is closed. The array is used - * to sprintf the PIDs and then used by read(). - */ -struct ctr_struct { - char *buf; - int bufsz; -}; - -/* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. - */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) -{ - int n = 0; - struct cgroup_iter it; - struct task_struct *tsk; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) - break; - pidarray[n++] = task_pid_vnr(tsk); - } - cgroup_iter_end(cgrp, &it); - return n; -} - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - int ret = -EINVAL; - struct cgroup *cgrp; - struct cgroup_iter it; - struct task_struct *tsk; - - /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. 
- */ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; - rcu_read_lock(); - - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - cgroup_iter_end(cgrp, &it); - - rcu_read_unlock(); -err: - return ret; -} - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -/* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'. Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. - */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) -{ - int cnt = 0; - int i; - - for (i = 0; i < npids; i++) - cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); - return cnt; -} - -/* - * Handle an open on 'tasks' file. Prepare a buffer listing the - * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any. - */ -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct ctr_struct *ctr; - pid_t *pidarray; - int npids; - char c; - - if (!(file->f_mode & FMODE_READ)) - return 0; - - ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); - if (!ctr) - goto err0; - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. 
- */ - npids = cgroup_task_count(cgrp); - if (npids) { - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - goto err1; - - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* Call pid_array_to_buf() twice, first just to get bufsz */ - ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; - ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); - if (!ctr->buf) - goto err2; - ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - - kfree(pidarray); - } else { - ctr->buf = NULL; - ctr->bufsz = 0; - } - file->private_data = ctr; - return 0; - -err2: - kfree(pidarray); -err1: - kfree(ctr); -err0: - return -ENOMEM; -} - -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct ctr_struct *ctr = file->private_data; - - return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} - -static int cgroup_tasks_release(struct inode *unused_inode, - struct file *file) -{ - struct ctr_struct *ctr; - - if (file->f_mode & FMODE_READ) { - ctr = file->private_data; - kfree(ctr->buf); - kfree(ctr); - } - return 0; -} - -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) -{ - return notify_on_release(cgrp); -} - -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) -{ - clear_bit(CGRP_RELEASABLE, &cgrp->flags); - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - return 0; -} - -/* - * for the common functions, 'private' gives the type of file - */ -static struct cftype files[] = { - { - .name = "tasks", - .open = cgroup_tasks_open, - .read = cgroup_tasks_read, - .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, - }, - - { - .name = "notify_on_release", - .read_u64 = 
cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, - }, -}; - -static struct cftype cft_release_agent = { - .name = "release_agent", - .read_seq_string = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, -}; - -static int cgroup_populate_dir(struct cgroup *cgrp) -{ - int err; - struct cgroup_subsys *ss; - - /* First clear out any existing files */ - cgroup_clear_directory(cgrp->dentry); - - err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); - if (err < 0) - return err; - - if (cgrp == cgrp->top_cgroup) { - if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) - return err; - } - - for_each_subsys(cgrp->root, ss) { - if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) - return err; - } - - return 0; -} - -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) -{ - css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); - css->flags = 0; - if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); - BUG_ON(cgrp->subsys[ss->subsys_id]); - cgrp->subsys[ss->subsys_id] = css; -} - -/* - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held - */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - int mode) -{ - struct cgroup *cgrp; - struct cgroupfs_root *root = parent->root; - int err = 0; - struct cgroup_subsys *ss; - struct super_block *sb = root->sb; - - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. 
This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - atomic_inc(&sb->s_active); - - mutex_lock(&cgroup_mutex); - - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); - - cgrp->parent = parent; - cgrp->root = parent->root; - cgrp->top_cgroup = parent->top_cgroup; - - if (notify_on_release(parent)) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - - for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_destroy; - } - init_cgroup_css(css, ss, cgrp); - } - - list_add(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - - err = cgroup_create_dir(cgrp, dentry, mode); - if (err < 0) - goto err_remove; - - /* The cgroup directory was pre-locked for us */ - BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); - - err = cgroup_populate_dir(cgrp); - /* If err < 0, we have a half-filled directory - oh well ;) */ - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - return 0; - - err_remove: - - list_del(&cgrp->sibling); - root->number_of_cgroups--; - - err_destroy: - - for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } - - mutex_unlock(&cgroup_mutex); - - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); - - kfree(cgrp); - return err; -} - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct cgroup *c_parent = dentry->d_parent->d_fsdata; - - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); -} - -static int cgroup_has_css_refs(struct cgroup *cgrp) -{ - /* Check the reference count on each subsystem. 
Since we - * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should - * be no outstanding references, so the subsystem is safe to - * destroy. We scan across all subsystems rather than using - * the per-hierarchy linked list of mounted subsystems since - * we can be called via check_for_release() with no - * synchronization other than RCU, and the subsystem linked - * list isn't RCU-safe */ - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - struct cgroup_subsys_state *css; - /* Skip subsystems not in this hierarchy */ - if (ss->root != cgrp->root) - continue; - css = cgrp->subsys[ss->subsys_id]; - /* When called from check_for_release() it's possible - * that by this point the cgroup has been removed - * and the css deleted. But a false-positive doesn't - * matter, since it can only happen if the cgroup - * has been deleted and hence no longer needs the - * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) - return 1; - } - return 0; -} - -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -{ - struct cgroup *cgrp = dentry->d_fsdata; - struct dentry *d; - struct cgroup *parent; - struct super_block *sb; - struct cgroupfs_root *root; - - /* the vfs holds both inode->i_mutex already */ - - mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - - parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; - - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. 
- */ - cgroup_call_pre_destroy(cgrp); - - if (cgroup_has_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - - spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); - if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); - spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ - list_del(&cgrp->sibling); - spin_lock(&cgrp->dentry->d_lock); - d = dget(cgrp->dentry); - spin_unlock(&d->d_lock); - - cgroup_d_remove_dir(d); - dput(d); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - - /* Create the top cgroup state for this subsystem */ - ss->root = &rootnode; - css = ss->create(ss, dummytop); - /* We don't handle early failures gracefully */ - BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, dummytop); - - /* Update the init_css_set to contain a subsys - * pointer to this state - since the subsystem is - * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; - - need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; - - /* At system boot, before all subsystems have been - * registered, no tasks have been forked, so we don't - * need to invoke fork callbacks here. */ - BUG_ON(!list_empty(&init_task.tasks)); - - ss->active = 1; -} - -/** - * cgroup_init_early - cgroup initialization at system boot - * - * Initialize cgroups at system boot, and initialize any - * subsystems that request early init. 
- */ -int __init cgroup_init_early(void) -{ - int i; - kref_init(&init_css_set.ref); - kref_get(&init_css_set.ref); - INIT_LIST_HEAD(&init_css_set.cg_links); - INIT_LIST_HEAD(&init_css_set.tasks); - INIT_HLIST_NODE(&init_css_set.hlist); - css_set_count = 1; - init_cgroup_root(&rootnode); - list_add(&rootnode.root_list, &roots); - root_count = 1; - init_task.cgroups = &init_css_set; - - init_css_set_link.cg = &init_css_set; - list_add(&init_css_set_link.cgrp_link_list, - &rootnode.top_cgroup.css_sets); - list_add(&init_css_set_link.cg_link_list, - &init_css_set.cg_links); - - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&css_set_table[i]); - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->create); - BUG_ON(!ss->destroy); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } - - if (ss->early_init) - cgroup_init_subsys(ss); - } - return 0; -} - -/** - * cgroup_init - cgroup initialization - * - * Register cgroup filesystem and /proc file, and initialize - * any subsystems that didn't request early init. 
- */ -int __init cgroup_init(void) -{ - int err; - int i; - struct hlist_head *hhead; - - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (!ss->early_init) - cgroup_init_subsys(ss); - } - - /* Add init_css_set to the hash table */ - hhead = css_set_hash(init_css_set.subsys); - hlist_add_head(&init_css_set.hlist, hhead); - - err = register_filesystem(&cgroup_fs_type); - if (err < 0) - goto out; - - proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; -} - -/* - * proc_cgroup_show() - * - Print task's cgroup paths into seq_file, one line for each hierarchy - * - Used for /proc//cgroup. - * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. - */ - -/* TODO: Use a proper seq_file iterator */ -static int proc_cgroup_show(struct seq_file *m, void *v) -{ - struct pid *pid; - struct task_struct *tsk; - char *buf; - int retval; - struct cgroupfs_root *root; - - retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) - goto out; - - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = 0; - - mutex_lock(&cgroup_mutex); - - for_each_root(root) { - struct cgroup_subsys *ss; - struct cgroup *cgrp; - int subsys_id; - int count = 0; - - /* Skip this hierarchy if it has no active subsystems */ - if (!root->actual_subsys_bits) - continue; - seq_printf(m, "%lu:", root->subsys_bits); - for_each_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? 
"," : "", ss->name); - seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) - goto out_unlock; - seq_puts(m, buf); - seq_putc(m, '\n'); - } - -out_unlock: - mutex_unlock(&cgroup_mutex); - put_task_struct(tsk); -out_free: - kfree(buf); -out: - return retval; -} - -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - mutex_lock(&cgroup_mutex); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, - ss->root->number_of_cgroups, !ss->disabled); - } - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/** - * cgroup_fork - attach newly forked task to its parents cgroup. - * @child: pointer to task_struct of forking parent process. - * - * Description: A task inherits its parent's cgroup at fork(). - * - * A pointer to the shared css_set was automatically copied in - * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. 
cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. - * - * At the point that cgroup_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. - */ -void cgroup_fork(struct task_struct *child) -{ - task_lock(current); - child->cgroups = current->cgroups; - get_css_set(child->cgroups); - task_unlock(current); - INIT_LIST_HEAD(&child->cg_list); -} - -/** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->fork) - ss->fork(ss, child); - } - } -} - -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp); - } - } -} -#endif /* CONFIG_MM_OWNER */ - -/** - * cgroup_post_fork - called on a new task after adding it to the task list - * @child: the task in question - * - * Adds the task to the list running through its css_set if necessary. 
- * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. - */ -void cgroup_post_fork(struct task_struct *child) -{ - if (use_task_css_set_links) { - write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &child->cgroups->tasks); - write_unlock(&css_set_lock); - } -} -/** - * cgroup_exit - detach cgroup from exiting task - * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? - * - * Description: Detach cgroup from @tsk and release it. - * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. - * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. - * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. 
But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. - */ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) -{ - int i; - struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } - - /* - * Unlink from the css_set task list if necessary. - * Optimistically check cg_list before taking - * css_set_lock - */ - if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); - write_unlock(&css_set_lock); - } - - /* Reassign the task to the init_css_set. */ - task_lock(tsk); - cg = tsk->cgroups; - tsk->cgroups = &init_css_set; - task_unlock(tsk); - if (cg) - put_css_set_taskexit(cg); -} - -/** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. 
- */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - printk(KERN_INFO - "Not cloning cgroup for unused subsystem %s\n", - subsys->name); - mutex_unlock(&cgroup_mutex); - return 0; - } - cg = tsk->cgroups; - parent = task_cgroup(tsk, subsys->subsys_id); - - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Keep the cgroup alive */ - get_css_set(cg); - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - if (!child) { - printk(KERN_INFO - "Couldn't find new cgroup %s\n", nodename); - ret = -ENOMEM; - goto out_release; - } - - /* The cgroup now exists. 
Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(parent->root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(parent->root->sb); - return ret; -} - -/** - * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp - * @cgrp: the cgroup in question - * - * See if @cgrp is a descendant of the current task's cgroup in - * the appropriate hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. 
- */ -int cgroup_is_descendant(const struct cgroup *cgrp) -{ - int ret; - struct cgroup *target; - int subsys_id; - - if (cgrp == dummytop) - return 1; - - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(current, subsys_id); - while (cgrp != target && cgrp!= cgrp->top_cgroup) - cgrp = cgrp->parent; - ret = (cgrp == target); - return ret; -} - -static void check_for_release(struct cgroup *cgrp) -{ - /* All of these checks rely on RCU to keep the cgroup - * structure alive */ - if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) - && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { - /* Control Group is currently removeable. If it's not - * already queued for a userspace notification, queue - * it now */ - int need_schedule_work = 0; - spin_lock(&release_list_lock); - if (!cgroup_is_removed(cgrp) && - list_empty(&cgrp->release_list)) { - list_add(&cgrp->release_list, &release_list); - need_schedule_work = 1; - } - spin_unlock(&release_list_lock); - if (need_schedule_work) - schedule_work(&release_agent_work); - } -} - -void __css_put(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - rcu_read_unlock(); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. - */ -static void cgroup_release_agent(struct work_struct *work) -{ - BUG_ON(work != &release_agent_work); - mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); - while (!list_empty(&release_list)) { - char *argv[3], *envp[3]; - int i; - char *pathbuf = NULL, *agentbuf = NULL; - struct cgroup *cgrp = list_entry(release_list.next, - struct cgroup, - release_list); - list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!pathbuf) - goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) - goto continue_free; - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!agentbuf) - goto continue_free; - - i = 0; - argv[i++] = agentbuf; - argv[i++] = pathbuf; - argv[i] = NULL; - - i = 0; - /* minimal command environment */ - envp[i++] = "HOME=/"; - envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[i] = NULL; - - /* Drop the lock while we invoke the usermode helper, - * since the exec could involve hitting disk and hence - * be a slow process */ - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - mutex_lock(&cgroup_mutex); - continue_free: - kfree(pathbuf); - kfree(agentbuf); - 
spin_lock(&release_list_lock); - } - spin_unlock(&release_list_lock); - mutex_unlock(&cgroup_mutex); -} - -static int __init cgroup_disable(char *str) -{ - int i; - char *token; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - if (!strcmp(token, ss->name)) { - ss->disabled = 1; - printk(KERN_INFO "Disabling %s control group" - " subsystem\n", ss->name); - break; - } - } - } - return 1; -} -__setup("cgroup_disable=", cgroup_disable); -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include -#include -#include -#include - -#include - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - cgroup_lock(); - count = cgroup_task_count(cont); - cgroup_unlock(); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->ref.refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = 
cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - } -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; -/* - * linux/kernel/compat.c - * - * Kernel compatibililty routines for e.g. 32 bit syscall support - * on 64 bit kernels. - * - * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include /* for MAX_SCHEDULE_TIMEOUT */ -#include -#include -#include -#include -#include -#include -#include - -#include - -int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) -{ - return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) -{ - return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? 
-EFAULT : 0; -} - -static long compat_nanosleep_restart(struct restart_block *restart) -{ - struct compat_timespec __user *rmtp; - struct timespec rmt; - mm_segment_t oldfs; - long ret; - - restart->nanosleep.rmtp = (struct timespec __user *) &rmt; - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep_restart(restart); - set_fs(oldfs); - - if (ret) { - rmtp = restart->nanosleep.compat_rmtp; - - if (rmtp && put_compat_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) -{ - struct timespec tu, rmt; - mm_segment_t oldfs; - long ret; - - if (get_compat_timespec(&tu, rqtp)) - return -EFAULT; - - if (!timespec_valid(&tu)) - return -EINVAL; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu, - rmtp ? (struct timespec __user *)&rmt : NULL, - HRTIMER_MODE_REL, CLOCK_MONOTONIC); - set_fs(oldfs); - - if (ret) { - struct restart_block *restart - = ¤t_thread_info()->restart_block; - - restart->fn = compat_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - - if (rmtp && put_compat_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -static inline long get_compat_itimerval(struct itimerval *o, - struct compat_itimerval __user *i) -{ - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | - __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | - __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | - __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); -} - -static inline long put_compat_itimerval(struct compat_itimerval __user *o, - struct itimerval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | - __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | - __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | - __put_user(i->it_value.tv_usec, 
&o->it_value.tv_usec))); -} - -asmlinkage long compat_sys_getitimer(int which, - struct compat_itimerval __user *it) -{ - struct itimerval kit; - int error; - - error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) - error = -EFAULT; - return error; -} - -asmlinkage long compat_sys_setitimer(int which, - struct compat_itimerval __user *in, - struct compat_itimerval __user *out) -{ - struct itimerval kin, kout; - int error; - - if (in) { - if (get_compat_itimerval(&kin, in)) - return -EFAULT; - } else - memset(&kin, 0, sizeof(kin)); - - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) - return error; - if (put_compat_itimerval(out, &kout)) - return -EFAULT; - return 0; -} - -asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) -{ - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ - if (tbuf) { - struct compat_tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. 
- */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); - if (copy_to_user(tbuf, &tmp, sizeof(tmp))) - return -EFAULT; - } - return compat_jiffies_to_clock_t(jiffies); -} - -/* - * Assumption: old_sigset_t and compat_old_sigset_t are both - * types that can be passed to put_user()/get_user(). - */ - -asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sigpending((old_sigset_t __user *) &s); - set_fs(old_fs); - if (ret == 0) - ret = put_user(s, set); - return ret; -} - -asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, - compat_old_sigset_t __user *oset) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs; - - if (set && get_user(s, set)) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_sigprocmask(how, - set ? (old_sigset_t __user *) &s : NULL, - oset ? 
(old_sigset_t __user *) &s : NULL); - set_fs(old_fs); - if (ret == 0) - if (oset) - ret = put_user(s, oset); - return ret; -} - -asmlinkage long compat_sys_setrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs (); - - if (resource >= RLIM_NLIMITS) - return -EINVAL; - - if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || - __get_user(r.rlim_cur, &rlim->rlim_cur) || - __get_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - - if (r.rlim_cur == COMPAT_RLIM_INFINITY) - r.rlim_cur = RLIM_INFINITY; - if (r.rlim_max == COMPAT_RLIM_INFINITY) - r.rlim_max = RLIM_INFINITY; - set_fs(KERNEL_DS); - ret = sys_setrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); - return ret; -} - -#ifdef COMPAT_RLIM_OLD_INFINITY - -asmlinkage long compat_sys_old_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, &r); - set_fs(old_fs); - - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -#endif - -asmlinkage long compat_sys_getrlimit (unsigned int resource, - struct compat_rlimit __user *rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_getrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - 
__put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) -{ - if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || - __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || - __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || - __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || - __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || - __put_user(r->ru_maxrss, &ru->ru_maxrss) || - __put_user(r->ru_ixrss, &ru->ru_ixrss) || - __put_user(r->ru_idrss, &ru->ru_idrss) || - __put_user(r->ru_isrss, &ru->ru_isrss) || - __put_user(r->ru_minflt, &ru->ru_minflt) || - __put_user(r->ru_majflt, &ru->ru_majflt) || - __put_user(r->ru_nswap, &ru->ru_nswap) || - __put_user(r->ru_inblock, &ru->ru_inblock) || - __put_user(r->ru_oublock, &ru->ru_oublock) || - __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || - __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || - __put_user(r->ru_nsignals, &ru->ru_nsignals) || - __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || - __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) - return -EFAULT; - return 0; -} - -asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) -{ - struct rusage r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_getrusage(who, (struct rusage __user *) &r); - set_fs(old_fs); - - if (ret) - return ret; - - if (put_compat_rusage(&r, ru)) - return -EFAULT; - - return 0; -} - -asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, - struct compat_rusage __user *ru) -{ - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? 
- (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; - } -} - -asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, - struct compat_siginfo __user *uinfo, int options, - struct compat_rusage __user *uru) -{ - siginfo_t info; - struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); - - set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - ret = put_compat_rusage(&ru, uru); - if (ret) - return ret; - } - - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(uinfo, &info); -} - -static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, - unsigned len, cpumask_t *new_mask) -{ - unsigned long *k; - - if (len < sizeof(cpumask_t)) - memset(new_mask, 0, sizeof(cpumask_t)); - else if (len > sizeof(cpumask_t)) - len = sizeof(cpumask_t); - - k = cpus_addr(*new_mask); - return compat_get_bitmap(k, user_mask_ptr, len * 8); -} - -asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, - unsigned int len, - compat_ulong_t __user *user_mask_ptr) -{ - cpumask_t new_mask; - int retval; - - retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; - - return sched_setaffinity(pid, &new_mask); -} - -asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, - compat_ulong_t __user *user_mask_ptr) -{ - int ret; - cpumask_t mask; - unsigned long *k; - unsigned int min_length = sizeof(cpumask_t); - - if (NR_CPUS <= BITS_PER_COMPAT_LONG) - min_length = sizeof(compat_ulong_t); - - if (len < min_length) - return -EINVAL; - - ret = 
sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; - - k = cpus_addr(mask); - ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret) - return ret; - - return min_length; -} - -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (get_compat_timespec(&dst->it_interval, &src->it_interval) || - get_compat_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (put_compat_timespec(&src->it_interval, &dst->it_interval) || - put_compat_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - -long compat_sys_timer_create(clockid_t which_clock, - struct compat_sigevent __user *timer_event_spec, - timer_t __user *created_timer_id) -{ - struct sigevent __user *event = NULL; - - if (timer_event_spec) { - struct sigevent kevent; - - event = compat_alloc_user_space(sizeof(*event)); - if (get_compat_sigevent(&kevent, timer_event_spec) || - copy_to_user(event, &kevent, sizeof(*event))) - return -EFAULT; - } - - return sys_timer_create(which_clock, event, created_timer_id); -} - -long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old) -{ - long err; - mm_segment_t oldfs; - struct itimerspec newts, oldts; - - if (!new) - return -EINVAL; - if (get_compat_itimerspec(&newts, new)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_settime(timer_id, flags, - (struct itimerspec __user *) &newts, - (struct itimerspec __user *) &oldts); - set_fs(oldfs); - if (!err && old && put_compat_itimerspec(old, &oldts)) - return -EFAULT; - return err; -} - -long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting) -{ - long err; - mm_segment_t oldfs; - struct itimerspec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = 
sys_timer_gettime(timer_id, - (struct itimerspec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_itimerspec(setting, &ts)) - return -EFAULT; - return err; -} - -long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - if (get_compat_timespec(&ts, tp)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_settime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - return err; -} - -long compat_sys_clock_gettime(clockid_t which_clock, - struct compat_timespec __user *tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_gettime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_getres(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && tp && put_compat_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -static long compat_clock_nanosleep_restart(struct restart_block *restart) -{ - long err; - mm_segment_t oldfs; - struct timespec tu; - struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; - - restart->nanosleep.rmtp = (struct timespec __user *) &tu; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = clock_nanosleep_restart(restart); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&tu, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct 
compat_timespec __user *rmtp) -{ - long err; - mm_segment_t oldfs; - struct timespec in, out; - struct restart_block *restart; - - if (get_compat_timespec(&in, rqtp)) - return -EFAULT; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_nanosleep(which_clock, flags, - (struct timespec __user *) &in, - (struct timespec __user *) &out); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&out, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t_thread_info()->restart_block; - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -/* - * We currently only need the following fields from the sigevent - * structure: sigev_value, sigev_signo, sig_notify and (sometimes - * sigev_notify_thread_id). The others are handled in user mode. - * We also assume that copying sigev_value.sival_int is sufficient - * to keep all the bits of sigev_value.sival_ptr intact. - */ -int get_compat_sigevent(struct sigevent *event, - const struct compat_sigevent __user *u_event) -{ - memset(event, 0, sizeof(*event)); - return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) || - __get_user(event->sigev_value.sival_int, - &u_event->sigev_value.sival_int) || - __get_user(event->sigev_signo, &u_event->sigev_signo) || - __get_user(event->sigev_notify, &u_event->sigev_notify) || - __get_user(event->sigev_notify_thread_id, - &u_event->sigev_notify_thread_id)) - ? 
-EFAULT : 0; -} - -long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, - unsigned long bitmap_size) -{ - int i, j; - unsigned long m; - compat_ulong_t um; - unsigned long nr_compat_longs; - - /* align bitmap up to nearest compat_long_t boundary */ - bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); - - if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) - return -EFAULT; - - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = 0; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - /* - * We dont want to read past the end of the userspace - * bitmap. We must however ensure the end of the - * kernel bitmap is zeroed. - */ - if (nr_compat_longs-- > 0) { - if (__get_user(um, umask)) - return -EFAULT; - } else { - um = 0; - } - - umask++; - m |= (long)um << (j * BITS_PER_COMPAT_LONG); - } - *mask++ = m; - } - - return 0; -} - -long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, - unsigned long bitmap_size) -{ - int i, j; - unsigned long m; - compat_ulong_t um; - unsigned long nr_compat_longs; - - /* align bitmap up to nearest compat_long_t boundary */ - bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); - - if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) - return -EFAULT; - - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = *mask++; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - um = m; - - /* - * We dont want to write past the end of the userspace - * bitmap. 
- */ - if (nr_compat_longs-- > 0) { - if (__put_user(um, umask)) - return -EFAULT; - } - - umask++; - m >>= 4*sizeof(um); - m >>= 4*sizeof(um); - } - } - - return 0; -} - -void -sigset_from_compat (sigset_t *set, compat_sigset_t *compat) -{ - switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); - } -} - -asmlinkage long -compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, - struct compat_siginfo __user *uinfo, - struct compat_timespec __user *uts, compat_size_t sigsetsize) -{ - compat_sigset_t s32; - sigset_t s; - int sig; - struct timespec t; - siginfo_t info; - long ret, timeout = 0; - - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&s, &s32); - sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&s); - - if (uts) { - if (get_compat_timespec (&t, uts)) - return -EFAULT; - if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 - || t.tv_sec < 0) - return -EINVAL; - } - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = timespec_to_jiffies(&t) - +(t.tv_sec || t.tv_nsec); - if (timeout) { - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &s); - - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - } - spin_unlock_irq(¤t->sighand->siglock); - - if (sig) { - ret = sig; - if (uinfo) { - if 
(copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - }else { - ret = timeout?-EINTR:-EAGAIN; - } - return ret; - -} - -#ifdef __ARCH_WANT_COMPAT_SYS_TIME - -/* compat_time_t is a 32 bit "long" and needs to get converted. */ - -asmlinkage long compat_sys_time(compat_time_t __user * tloc) -{ - compat_time_t i; - struct timeval tv; - - do_gettimeofday(&tv); - i = tv.tv_sec; - - if (tloc) { - if (put_user(i,tloc)) - i = -EFAULT; - } - return i; -} - -asmlinkage long compat_sys_stime(compat_time_t __user *tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ - -#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) -{ - sigset_t newset; - compat_sigset_t newset32; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&newset, &newset32); - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - spin_lock_irq(¤t->sighand->siglock); - current->saved_sigmask = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; -} -#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ - -asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) -{ - struct timex txc; - int ret; - - memset(&txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, &utp->stbcnt)) - return -EFAULT; - - ret = do_adjtimex(&txc); - - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, 
&utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; - - return ret; -} - -#ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} - -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, - const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} -#endif - -struct compat_sysinfo { - s32 uptime; - u32 loads[3]; - u32 totalram; - u32 freeram; - u32 sharedram; - u32 bufferram; - u32 totalswap; - u32 freeswap; - u16 procs; - u16 pad; - u32 totalhigh; - u32 freehigh; - u32 mem_unit; - char _f[20-2*sizeof(u32)-sizeof(int)]; -}; - -asmlinkage long -compat_sys_sysinfo(struct compat_sysinfo __user *info) -{ - struct sysinfo s; - - do_sysinfo(&s); - - /* Check to see if any memory value is too large for 32-bit and scale - * down if needed - */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { - int bitcount = 0; - - while (s.mem_unit < PAGE_SIZE) { - s.mem_unit <<= 1; - bitcount++; - } - - s.totalram >>= bitcount; - s.freeram >>= bitcount; - s.sharedram >>= bitcount; - s.bufferram >>= bitcount; - s.totalswap >>= bitcount; - s.freeswap >>= bitcount; - s.totalhigh >>= bitcount; - s.freehigh >>= bitcount; - } - - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || - __put_user (s.uptime, &info->uptime) || - __put_user (s.loads[0], &info->loads[0]) || - __put_user (s.loads[1], &info->loads[1]) || - __put_user (s.loads[2], &info->loads[2]) || - __put_user (s.totalram, &info->totalram) || - __put_user (s.freeram, &info->freeram) || - __put_user (s.sharedram, &info->sharedram) || - __put_user (s.bufferram, &info->bufferram) || - __put_user (s.totalswap, &info->totalswap) || - __put_user (s.freeswap, &info->freeswap) || - __put_user (s.procs, &info->procs) || - __put_user (s.totalhigh, &info->totalhigh) || - __put_user (s.freehigh, &info->freehigh) || - __put_user 
(s.mem_unit, &info->mem_unit)) - return -EFAULT; - - return 0; -} - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); -/* - * kernel/configs.c - * Echo the kernel .config file used to build the kernel - * - * Copyright (C) 2002 Khalid Aziz - * Copyright (C) 2002 Randy Dunlap - * Copyright (C) 2002 Al Stone - * Copyright (C) 2002 Hewlett-Packard Company - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include - -/**************************************************/ -/* the actual current config file */ - -/* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. 
The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * - * IKCFG_ED - */ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) - -#ifdef CONFIG_IKCONFIG_PROC - -/**************************************************/ -/* globals and useful constants */ - -static ssize_t -ikconfig_read_current(struct file *file, char __user *buf, - size_t len, loff_t * offset) -{ - return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); -} - -static const struct file_operations ikconfig_file_ops = { - .owner = THIS_MODULE, - .read = ikconfig_read_current, -}; - -/***************************************************/ -/* ikconfig_init: start up everything we need to */ - -static int __init ikconfig_init(void) -{ - struct proc_dir_entry *entry; - - /* create the current config file */ - entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, - &ikconfig_file_ops); - if (!entry) - return -ENOMEM; - - entry->size = kernel_config_data_size; - - return 0; -} - -/***************************************************/ -/* ikconfig_cleanup: clean up our mess */ - -static void __exit ikconfig_cleanup(void) -{ - remove_proc_entry("config.gz", NULL); -} - -module_init(ikconfig_init); -module_exit(ikconfig_cleanup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Randy Dunlap"); -MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); - -#endif /* CONFIG_IKCONFIG_PROC */ -/* CPU control. - * (C) 2001, 2002, 2003, 2004 Rusty Russell - * - * This code is licenced under the GPL. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP - -/* - * Represents all cpu's that are currently online. - */ -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); - -#else /* CONFIG_SMP */ - -/* Serializes the updates to cpu_online_map, cpu_present_map */ -static DEFINE_MUTEX(cpu_add_remove_lock); - -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); - -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. - * Should always be manipulated under cpu_add_remove_lock - */ -static int cpu_hotplug_disabled; - -static struct { - struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. 
- */ - int refcount; -} cpu_hotplug; - -void __init cpu_hotplug_init(void) -{ - cpu_hotplug.active_writer = NULL; - mutex_init(&cpu_hotplug.lock); - cpu_hotplug.refcount = 0; -} - -cpumask_t cpu_active_map; - -#ifdef CONFIG_HOTPLUG_CPU - -void get_online_cpus(void) -{ - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.refcount++; - mutex_unlock(&cpu_hotplug.lock); - -} -EXPORT_SYMBOL_GPL(get_online_cpus); - -void put_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return; - mutex_lock(&cpu_hotplug.lock); - if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) - wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); - -} -EXPORT_SYMBOL_GPL(put_online_cpus); - -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_map, cpu_present_map. - */ -void cpu_maps_update_begin(void) -{ - mutex_lock(&cpu_add_remove_lock); -} - -void cpu_maps_update_done(void) -{ - mutex_unlock(&cpu_add_remove_lock); -} - -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. - * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. 
- * - */ -static void cpu_hotplug_begin(void) -{ - cpu_hotplug.active_writer = current; - - for (;;) { - mutex_lock(&cpu_hotplug.lock); - if (likely(!cpu_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } -} - -static void cpu_hotplug_done(void) -{ - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); -} -/* Need to know about CPUs going up/down? */ -int __ref register_cpu_notifier(struct notifier_block *nb) -{ - int ret; - cpu_maps_update_begin(); - ret = raw_notifier_chain_register(&cpu_chain, nb); - cpu_maps_update_done(); - return ret; -} - -#ifdef CONFIG_HOTPLUG_CPU - -EXPORT_SYMBOL(register_cpu_notifier); - -void __ref unregister_cpu_notifier(struct notifier_block *nb) -{ - cpu_maps_update_begin(); - raw_notifier_chain_unregister(&cpu_chain, nb); - cpu_maps_update_done(); -} -EXPORT_SYMBOL(unregister_cpu_notifier); - -static inline void check_for_tasks(int cpu) -{ - struct task_struct *p; - - write_lock_irq(&tasklist_lock); - for_each_process(p) { - if (task_cpu(p) == cpu && - (!cputime_eq(p->utime, cputime_zero) || - !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); - } - write_unlock_irq(&tasklist_lock); -} - -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; - -/* Take this CPU down. */ -static int __ref take_cpu_down(void *_param) -{ - struct take_cpu_down_param *param = _param; - int err; - - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); - /* Ensure this CPU doesn't handle any more interrupts. */ - err = __cpu_disable(); - if (err < 0) - return err; - - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. 
*/ - sched_idle_next(); - return 0; -} - -/* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) -{ - int err, nr_calls = 0; - cpumask_t old_allowed, tmp; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, - hcpu, -1, &nr_calls); - if (err == NOTIFY_BAD) { - nr_calls--; - __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - err = -EINVAL; - goto out_release; - } - - /* Ensure that we are not runnable on dying cpu */ - old_allowed = current->cpus_allowed; - cpus_setall(tmp); - cpu_clear(cpu, tmp); - set_cpus_allowed_ptr(current, &tmp); - tmp = cpumask_of_cpu(cpu); - - err = __stop_machine(take_cpu_down, &tcd_param, &tmp); - if (err) { - /* CPU didn't die: tell everyone. Can't complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu) == NOTIFY_BAD) - BUG(); - - goto out_allowed; - } - BUG_ON(cpu_online(cpu)); - - /* Wait for it to sleep (leaving idle task). */ - while (!idle_cpu(cpu)) - yield(); - - /* This actually kills the CPU. */ - __cpu_die(cpu); - - /* CPU is completely dead: tell everyone. Too late to complain. 
*/ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); - - check_for_tasks(cpu); - -out_allowed: - set_cpus_allowed_ptr(current, &old_allowed); -out_release: - cpu_hotplug_done(); - if (!err) { - if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); - } - return err; -} - -int __ref cpu_down(unsigned int cpu) -{ - int err = 0; - - cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - cpu_clear(cpu, cpu_active_map); - - /* - * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_map. - * This is not strictly necessary becuase stop_machine() - * that we run down the line already provides the required - * synchronization. But it's really a side effect and we do not - * want to depend on the innards of the stop_machine here. - */ - synchronize_sched(); - - err = _cpu_down(cpu, 0); - - if (cpu_online(cpu)) - cpu_set(cpu, cpu_active_map); - -out: - cpu_maps_update_done(); - return err; -} -EXPORT_SYMBOL(cpu_down); -#endif /*CONFIG_HOTPLUG_CPU*/ - -/* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) -{ - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - - if (cpu_online(cpu) || !cpu_present(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, - -1, &nr_calls); - if (ret == NOTIFY_BAD) { - nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - ret = -EINVAL; - goto out_notify; - } - - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu); - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); - - cpu_set(cpu, cpu_active_map); - - /* Now call notifier in preparation. 
*/ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); - -out_notify: - if (ret != 0) - __raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); - cpu_hotplug_done(); - - return ret; -} - -int __cpuinit cpu_up(unsigned int cpu) -{ - int err = 0; - if (!cpu_isset(cpu, cpu_possible_map)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); -#endif - return -EINVAL; - } - - cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - err = _cpu_up(cpu, 0); - -out: - cpu_maps_update_done(); - return err; -} - -#ifdef CONFIG_PM_SLEEP_SMP -static cpumask_t frozen_cpus; - -int disable_nonboot_cpus(void) -{ - int cpu, first_cpu, error = 0; - - cpu_maps_update_begin(); - first_cpu = first_cpu(cpu_online_map); - /* We take down all of the non-boot CPUs in one shot to avoid races - * with the userspace trying to use the CPU hotplug at the same time - */ - cpus_clear(frozen_cpus); - printk("Disabling non-boot CPUs ...\n"); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - error = _cpu_down(cpu, 1); - if (!error) { - cpu_set(cpu, frozen_cpus); - printk("CPU%d is down\n", cpu); - } else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); - break; - } - } - if (!error) { - BUG_ON(num_online_cpus() > 1); - /* Make sure the CPUs won't be enabled by someone else */ - cpu_hotplug_disabled = 1; - } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); - } - cpu_maps_update_done(); - return error; -} - -void __ref enable_nonboot_cpus(void) -{ - int cpu, error; - - /* Allow everyone to use the CPU hotplug again */ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - if (cpus_empty(frozen_cpus)) - goto out; - - printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask_nr(cpu, frozen_cpus) 
{ - error = _cpu_up(cpu, 1); - if (!error) { - printk("CPU%d is up\n", cpu); - continue; - } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); - } - cpus_clear(frozen_cpus); -out: - cpu_maps_update_done(); -} -#endif /* CONFIG_PM_SLEEP_SMP */ - -#endif /* CONFIG_SMP */ - -/* - * cpu_bit_bitmap[] is a special, "compressed" data structure that - * represents all NR_CPUS bits binary values of 1< 32 - MASK_DECLARE_8(32), MASK_DECLARE_8(40), - MASK_DECLARE_8(48), MASK_DECLARE_8(56), -#endif -}; -EXPORT_SYMBOL_GPL(cpu_bit_bitmap); -/* - * kernel/cpuset.c - * - * Processor and Memory placement constraints for sets of tasks. - * - * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004-2007 Silicon Graphics, Inc. - * Copyright (C) 2006 Google, Inc - * - * Portions derived from Patrick Mochel's sysfs code. - * sysfs is Copyright (c) 2001-3 Patrick Mochel - * - * 2003-10-10 Written by Simon Derr. - * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson. - * 2006 Rework by Paul Menage to use generic cgroups - * 2008 Rework of the scheduler domains and CPU hotplug handling - * by Max Krasnyansky - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. 
- */ -int number_of_cpusets __read_mostly; - -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; -struct cpuset; - -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - - struct cpuset *parent; /* my parent */ - - /* - * Copy of global cpuset_mems_generation as of the most - * recent time this cpuset changed its mems_allowed. - */ - int mems_generation; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; - - /* used for walking a cpuset heirarchy */ - struct list_head stack_list; -}; - -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cont) -{ - return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), - struct cpuset, css); -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return container_of(task_subsys_state(task, cpuset_subsys_id), - struct cpuset, css); -} -struct cpuset_hotplug_scanner { - struct cgroup_scanner scan; - struct cgroup *to; -}; - -/* bits in struct cpuset flags field */ -typedef enum { - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int 
is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - -/* - * Increment this integer everytime any cpuset changes its - * mems_allowed value. Users of cpusets can track this generation - * number, and avoid having to lock and reload mems_allowed unless - * the cpuset they're using changes generation. - * - * A single, global generation is needed because cpuset_attach_task() could - * reattach a task to a different cpuset, which must not have its - * generation numbers aliased with those of that tasks previous cpuset. - * - * Generations are needed for mems_allowed because one task cannot - * modify another's memory placement. So we must enable every task, - * on every visit to __alloc_pages(), to efficiently check whether - * its current->cpuset->mems_allowed has changed, requiring an update - * of its current->mems_allowed. - * - * Since writes to cpuset_mems_generation are guarded by the cgroup lock - * there is no need to mark it atomic. - */ -static int cpuset_mems_generation; - -static struct cpuset top_cpuset = { - .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .cpus_allowed = CPU_MASK_ALL, - .mems_allowed = NODE_MASK_ALL, -}; - -/* - * There are two global mutexes guarding cpuset structures. The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock(). 
The second is the cpuset-specific - * callback_mutex, below. They can nest. It is ok to first take - * cgroup_mutex, then nest callback_mutex. We also require taking - * task_lock() when dereferencing a task's cpuset pointer. See "The - * task_lock() exception", at the end of this comment. - * - * A task must hold both mutexes to modify cpusets. If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets. It can perform various checks on - * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_mutex, as that would risk double tripping on callback_mutex - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_mutex, then it has read-only - * access to cpusets. - * - * The task_struct fields mems_allowed and mems_generation may only - * be accessed in the context of that task, so require no locks. - * - * The cpuset_common_file_read() handlers only hold callback_mutex across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c - */ - -static DEFINE_MUTEX(callback_mutex); - -/* - * This is ugly, but preserves the userspace API for existing cpuset - * users. 
If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead - */ -static int cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - int ret = -ENODEV; - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->get_sb(cgroup_fs, flags, - unused_dev_name, mountopts, mnt); - put_filesystem(cgroup_fs); - } - return ret; -} - -static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .get_sb = cpuset_get_sb, -}; - -/* - * Return in *pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_map. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_map. - * - * One way or another, we guarantee to return some non-empty subset - * of cpu_online_map. - * - * Call with callback_mutex held. - */ - -static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) -{ - while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) - cs = cs->parent; - if (cs) - cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); - else - *pmask = cpu_online_map; - BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); -} - -/* - * Return in *pmask the portion of a cpusets's mems_allowed that - * are online, with memory. If none are online with memory, walk - * up the cpuset hierarchy until we find one that does have some - * online mems. If we get all the way to the top and still haven't - * found any online mems, return node_states[N_HIGH_MEMORY]. - * - * One way or another, we guarantee to return some non-empty subset - * of node_states[N_HIGH_MEMORY]. - * - * Call with callback_mutex held. 
- */ - -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) -{ - while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_HIGH_MEMORY])) - cs = cs->parent; - if (cs) - nodes_and(*pmask, cs->mems_allowed, - node_states[N_HIGH_MEMORY]); - else - *pmask = node_states[N_HIGH_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); -} - -/** - * cpuset_update_task_memory_state - update task memory placement - * - * If the current tasks cpusets mems_allowed changed behind our - * backs, update current->mems_allowed, mems_generation and task NUMA - * mempolicy to the new value. - * - * Task mempolicy is updated by rebinding it relative to the - * current->cpuset if a task has its memory placement changed. - * Do not call this routine if in_interrupt(). - * - * Call without callback_mutex or task_lock() held. May be - * called with or without cgroup_mutex held. Thanks in part to - * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex during - * call. - * - * Reading current->cpuset->mems_generation doesn't need task_lock - * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset using RCU. - * - * The rcu_dereference() is technically probably not needed, - * as I don't actually mind if I see a new cpuset pointer but - * an old value of mems_generation. However this really only - * matters on alpha systems using cpusets heavily. If I dropped - * that rcu_dereference(), it would save them a memory barrier. - * For all other arch's, rcu_dereference is a no-op anyway, and for - * alpha systems not using cpusets, another planned optimization, - * avoiding the rcu critical section for tasks in the root cpuset - * which is statically allocated, so can't vanish, will make this - * irrelevant. 
Better to use RCU as intended, than to engage in - * some cute trick to save a memory barrier that is impossible to - * test, for alpha systems using cpusets heavily, which might not - * even exist. - * - * This routine is needed to update the per-task mems_allowed data, - * within the tasks context, when it is trying to allocate memory - * (in various mm/mempolicy.c routines) and notices that some other - * task has been modifying its cpuset. - */ - -void cpuset_update_task_memory_state(void) -{ - int my_cpusets_mem_gen; - struct task_struct *tsk = current; - struct cpuset *cs; - - if (task_cs(tsk) == &top_cpuset) { - /* Don't need rcu for top_cpuset. It's never freed. */ - my_cpusets_mem_gen = top_cpuset.mems_generation; - } else { - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - } - - if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - mutex_lock(&callback_mutex); - task_lock(tsk); - cs = task_cs(tsk); /* Maybe changed when task not locked */ - guarantee_online_mems(cs, &tsk->mems_allowed); - tsk->cpuset_mems_generation = cs->mems_generation; - if (is_spread_page(cs)) - tsk->flags |= PF_SPREAD_PAGE; - else - tsk->flags &= ~PF_SPREAD_PAGE; - if (is_spread_slab(cs)) - tsk->flags |= PF_SPREAD_SLAB; - else - tsk->flags &= ~PF_SPREAD_SLAB; - task_unlock(tsk); - mutex_unlock(&callback_mutex); - mpol_rebind_task(tsk, &tsk->mems_allowed); - } -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. 
- */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpus_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - -/* - * validate_change() - Used to validate that any proposed cpuset change - * follows the structural rules for cpusets. - * - * If we replaced the flag and mask values of the current cpuset - * (cur) with those values in the trial cpuset (trial), would - * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. - * - * 'cur' is the address of an actual, in-use cpuset. Operations - * such as list traversal that depend on the actual address of the - * cpuset in the list must use cur below, not trial. - * - * 'trial' is the address of bulk structure copy of cur, with - * perhaps one or more of the fields cpus_allowed, mems_allowed, - * or flags changed to new, trial values. - * - * Return 0 if valid, -errno if not. 
- */ - -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) -{ - struct cgroup *cont; - struct cpuset *c, *par; - - /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } - - /* Remaining checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; - - par = cur->parent; - - /* We must be a subset of our parent cpuset */ - if (!is_cpuset_subset(trial, par)) - return -EACCES; - - /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; - } - - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpus_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } - - return 0; -} - -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? 
- */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpus_intersects(a->cpus_allowed, b->cpus_allowed); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) -{ - LIST_HEAD(q); - - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpus_empty(cp->cpus_allowed)) - continue; - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } -} - -/* - * generate_sched_domains() - * - * This function builds a partial partition of the systems CPUs - * A 'partial partition' is a set of non-overlapping subsets whose - * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched.c - * partition_sched_domains() routine, which will rebuild the scheduler's - * load balancing domains (sched domains) as specified by that partial - * partition. - * - * See "What is sched_load_balance" in Documentation/cpusets.txt - * for a background explanation of this. - * - * Does not return errors, on the theory that the callers of this - * routine would rather not worry about failures to rebuild sched - * domains when operating in the severe memory shortage situations - * that could cause allocation failures below. - * - * Must be called with cgroup_lock held. - * - * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. 
This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. - * csa - (for CpuSet Array) Array of pointers to all the cpusets - * that need to be load balanced, for convenient iterative - * access by the subsequent code that finds the best partition, - * i.e the set of domains (subsets) of CPUs such that the - * cpus_allowed of every cpuset marked is_sched_load_balance - * is a subset of one of these domains, while there are as - * many such domains as possible, each as small as possible. - * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched.c routine partition_sched_domains() in a - * convenient format, that can be easily compared to the prior - * value to determine what partition elements (sched domains) - * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. - * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). 
- */ -static int generate_sched_domains(cpumask_t **domains, - struct sched_domain_attr **attributes) -{ - LIST_HEAD(q); /* queue of cpusets to be scanned */ - struct cpuset *cp; /* scans q */ - struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ - cpumask_t *doms; /* resulting partition; i.e. sched domains */ - struct sched_domain_attr *dattr; /* attributes for custom domains */ - int ndoms; /* number of sched domains in result */ - int nslot; /* next empty doms[] cpumask_t slot */ - - doms = NULL; - dattr = NULL; - csa = NULL; - - /* Special case for the 99% of systems with one, full, sched domain */ - if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - *doms = top_cpuset.cpus_allowed; - - ndoms = 1; - goto done; - } - - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); - if (!csa) - goto done; - csn = 0; - - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpus_empty(cp->cpus_allowed)) - continue; - - /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. 
- */ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; - continue; - } - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } - - for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; - -restart: - /* Find the best partition (set of sched domains) */ - for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; - } - } - } - - /* - * Now we know how many domains to create. - * Convert to and populate cpu masks. - */ - doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto done; - - /* - * The rest of the code, including the scheduler, can deal with - * dattr==NULL case. No need to abort if alloc fails. - */ - dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); - - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - cpumask_t *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms + nslot; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; - } - - cpus_clear(*dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); - if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; - } - } - nslot++; - } - BUG_ON(nslot != ndoms); - -done: - kfree(csa); - - /* - * Fallback to the default domain if kmalloc() failed. 
- * See comments in partition_sched_domains(). - */ - if (doms == NULL) - ndoms = 1; - - *domains = doms; - *attributes = dattr; - return ndoms; -} - -/* - * Rebuild scheduler domains. - * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). - * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. - */ -static void do_rebuild_sched_domains(struct work_struct *unused) -{ - struct sched_domain_attr *attr; - cpumask_t *doms; - int ndoms; - - get_online_cpus(); - - /* Generate domain masks and attrs */ - cgroup_lock(); - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); - - put_online_cpus(); -} - -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. - * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. 
- */ -static void async_rebuild_sched_domains(void) -{ - schedule_work(&rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */ -void rebuild_sched_domains(void) -{ - do_rebuild_sched_domains(NULL); -} - -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Call with cgroup_mutex held. May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). - */ -static int cpuset_test_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - return !cpus_equal(tsk->cpus_allowed, - (cgroup_cs(scan->cg))->cpus_allowed); -} - -/** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task - * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. - */ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); -} - -/** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 
- * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() - * - * Called with cgroup_mutex held - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. - */ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) -{ - struct cgroup_scanner scan; - - scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); -} - -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, const char *buf) -{ - struct ptr_heap heap; - struct cpuset trialcs; - int retval; - int is_load_balanced; - - /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; - - trialcs = *cs; - - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. 
- */ - if (!*buf) { - cpus_clear(trialcs.cpus_allowed); - } else { - retval = cpulist_parse(buf, trialcs.cpus_allowed); - if (retval < 0) - return retval; - - if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) - return -EINVAL; - } - retval = validate_change(cs, &trialcs); - if (retval < 0) - return retval; - - /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) - return 0; - - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - - is_load_balanced = is_sched_load_balance(&trialcs); - - mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs.cpus_allowed; - mutex_unlock(&callback_mutex); - - /* - * Scan tasks in the cpuset, and update the cpumasks of any - * that need an update. - */ - update_tasks_cpumask(cs, &heap); - - heap_free(&heap); - - if (is_load_balanced) - async_rebuild_sched_domains(); - return 0; -} - -/* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * Call holding cgroup_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. 
- * - * We call cpuset_update_task_memory_state() before hacking - * our tasks mems_allowed, so that we are assured of being in - * sync with our tasks cpuset, and in particular, callbacks to - * cpuset_update_task_memory_state() from nested page allocations - * won't see any mismatch of our cpuset and task mems_generation - * values, so won't overwrite our hacked tasks mems_allowed - * nodemask. - */ - -static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, - const nodemask_t *to) -{ - struct task_struct *tsk = current; - - cpuset_update_task_memory_state(); - - mutex_lock(&callback_mutex); - tsk->mems_allowed = *to; - mutex_unlock(&callback_mutex); - - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - - mutex_lock(&callback_mutex); - guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); - mutex_unlock(&callback_mutex); -} - -static void *cpuset_being_rebound; - -/** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. - * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @oldmem: old mems_allowed of cpuset cs - * - * Called with cgroup_mutex held - * Return 0 if successful, -errno if not. - */ -static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) -{ - struct task_struct *p; - struct mm_struct **mmarray; - int i, n, ntasks; - int migrate; - int fudge; - struct cgroup_iter it; - int retval; - - cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - - fudge = 10; /* spare mmarray[] slots */ - fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ - retval = -ENOMEM; - - /* - * Allocate mmarray[] to hold mm reference for each task - * in cpuset cs. Can't kmalloc GFP_KERNEL while holding - * tasklist_lock. We could use GFP_ATOMIC, but with a - * few more lines of code, we can retry until we get a big - * enough mmarray[] w/o using GFP_ATOMIC. 
- */ - while (1) { - ntasks = cgroup_task_count(cs->css.cgroup); /* guess */ - ntasks += fudge; - mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); - if (!mmarray) - goto done; - read_lock(&tasklist_lock); /* block fork */ - if (cgroup_task_count(cs->css.cgroup) <= ntasks) - break; /* got enough */ - read_unlock(&tasklist_lock); /* try again */ - kfree(mmarray); - } - - n = 0; - - /* Load up mmarray[] with mm reference for each task in cpuset. */ - cgroup_iter_start(cs->css.cgroup, &it); - while ((p = cgroup_iter_next(cs->css.cgroup, &it))) { - struct mm_struct *mm; - - if (n >= ntasks) { - printk(KERN_WARNING - "Cpuset mempolicy rebind incomplete.\n"); - break; - } - mm = get_task_mm(p); - if (!mm) - continue; - mmarray[n++] = mm; - } - cgroup_iter_end(cs->css.cgroup, &it); - read_unlock(&tasklist_lock); - - /* - * Now that we've dropped the tasklist spinlock, we can - * rebind the vma mempolicies of each mm in mmarray[] to their - * new cpuset, and release that mm. The mpol_rebind_mm() - * call takes mmap_sem, which we couldn't take while holding - * tasklist_lock. Forks can happen again now - the mpol_dup() - * cpuset_being_rebound check will catch such forks, and rebind - * their vma mempolicies too. Because we still hold the global - * cgroup_mutex, we know that no other rebind effort will - * be contending for the global variable cpuset_being_rebound. - * It's ok if we rebind the same mm twice; mpol_rebind_mm() - * is idempotent. Also migrate pages in each mm to new nodes. - */ - migrate = is_memory_migrate(cs); - for (i = 0; i < n; i++) { - struct mm_struct *mm = mmarray[i]; - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); - mmput(mm); - } - - /* We're done rebinding vmas to this cpuset's new mems_allowed. */ - kfree(mmarray); - cpuset_being_rebound = NULL; - retval = 0; -done: - return retval; -} - -/* - * Handle user request to change the 'mems' memory placement - * of a cpuset. 
Needs to validate the request, update the - * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies and if - * the cpuset is marked 'memory_migrate', migrate the tasks - * pages to the new memory. - * - * Call with cgroup_mutex held. May take callback_mutex during call. - * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind - * their mempolicies to the cpusets new mems_allowed. - */ -static int update_nodemask(struct cpuset *cs, const char *buf) -{ - struct cpuset trialcs; - nodemask_t oldmem; - int retval; - - /* - * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) - return -EACCES; - - trialcs = *cs; - - /* - * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. 
- */ - if (!*buf) { - nodes_clear(trialcs.mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; - - if (!nodes_subset(trialcs.mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; - } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } - retval = validate_change(cs, &trialcs); - if (retval < 0) - goto done; - - mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; - cs->mems_generation = cpuset_mems_generation++; - mutex_unlock(&callback_mutex); - - retval = update_tasks_nodemask(cs, &oldmem); -done: - return retval; -} - -int current_cpuset_is_being_rebound(void) -{ - return task_cs(current) == cpuset_being_rebound; -} - -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ - if (val < -1 || val >= SD_LV_MAX) - return -EINVAL; - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - async_rebuild_sched_domains(); - } - - return 0; -} - -/* - * update_flag - read a 0 or a 1 in a file and update associated flag - * bit: the bit to update (see cpuset_flagbits_t) - * cs: the cpuset to update - * turning_on: whether the flag is being set or cleared - * - * Call with cgroup_mutex held. 
- */ - -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on) -{ - struct cpuset trialcs; - int err; - int cpus_nonempty, balance_flag_changed; - - trialcs = *cs; - if (turning_on) - set_bit(bit, &trialcs.flags); - else - clear_bit(bit, &trialcs.flags); - - err = validate_change(cs, &trialcs); - if (err < 0) - return err; - - cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); - balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(&trialcs)); - - mutex_lock(&callback_mutex); - cs->flags = trialcs.flags; - mutex_unlock(&callback_mutex); - - if (cpus_nonempty && balance_flag_changed) - async_rebuild_sched_domains(); - - return 0; -} - -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. 
If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time_t now = get_seconds(); - time_t ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). 
*/ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) -{ - struct cpuset *cs = cgroup_cs(cont); - - if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; - if (tsk->flags & PF_THREAD_BOUND) { - cpumask_t mask; - - mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; - mutex_unlock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, mask)) - return -EINVAL; - } - - return security_task_setscheduler(tsk, 0, NULL); -} - -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) -{ - cpumask_t cpus; - nodemask_t from, to; - struct mm_struct *mm; - struct cpuset *cs = cgroup_cs(cont); - struct cpuset *oldcs = cgroup_cs(oldcont); - int err; - - mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); - err = set_cpus_allowed_ptr(tsk, &cpus); - mutex_unlock(&callback_mutex); - if (err) - return; - - from = oldcs->mems_allowed; - to = cs->mems_allowed; - mm = get_task_mm(tsk); - if (mm) { - mpol_rebind_mm(mm, &to); - if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &from, &to); - mmput(mm); - } - -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - 
FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) -{ - int retval = 0; - struct cpuset *cs = cgroup_cs(cgrp); - cpuset_filetype_t type = cft->private; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_MEMORY_PRESSURE: - retval = -EACCES; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - cs->mems_generation = cpuset_mems_generation++; - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - cs->mems_generation = cpuset_mems_generation++; - break; - default: - retval = -EINVAL; - break; - } - cgroup_unlock(); - return retval; -} - -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) -{ - int retval = 0; - struct cpuset *cs = cgroup_cs(cgrp); - cpuset_filetype_t type = cft->private; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } - cgroup_unlock(); - return retval; -} - -/* - * Common handling for a write to a "cpus" or "mems" file. 
- */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) -{ - int retval = 0; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - switch (cft->private) { - case FILE_CPULIST: - retval = update_cpumask(cgroup_cs(cgrp), buf); - break; - case FILE_MEMLIST: - retval = update_nodemask(cgroup_cs(cgrp), buf); - break; - default: - retval = -EINVAL; - break; - } - cgroup_unlock(); - return retval; -} - -/* - * These ascii lists should be read in a single call, by using a user - * buffer large enough to hold the entire map. If read in smaller - * chunks, there is no guarantee of atomicity. Since the display format - * used, list of ranges of sequential numbers, is variable length, - * and since these maps can change value dynamically, one could read - * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. 
- */ - -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) -{ - cpumask_t mask; - - mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; - mutex_unlock(&callback_mutex); - - return cpulist_scnprintf(page, PAGE_SIZE, mask); -} - -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - nodemask_t mask; - - mutex_lock(&callback_mutex); - mask = cs->mems_allowed; - mutex_unlock(&callback_mutex); - - return nodelist_scnprintf(page, PAGE_SIZE, mask); -} - -static ssize_t cpuset_common_file_read(struct cgroup *cont, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - s = page; - - switch (type) { - case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); - break; - case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); - break; - default: - retval = -EINVAL; - goto out; - } - *s++ = '\n'; - - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; -} - -static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy 
*/ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unrechable but makes gcc happy */ - return 0; -} - - -/* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype files[] = { - { - .name = "cpus", - .read = cpuset_common_file_read, - .write_string = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .read = cpuset_common_file_read, - .write_string = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - 
.write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, -}; - -static struct cftype cft_memory_pressure_enabled = { - .name = "memory_pressure_enabled", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, -}; - -static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - int err; - - err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); - if (err) - return err; - /* memory_pressure_enabled is in root cpuset only */ - if (!cont->parent) - err = cgroup_add_file(cont, ss, - &cft_memory_pressure_enabled); - return err; -} - -/* - * post_clone() is called at the end of cgroup_clone(). - * 'cgroup' was just created automatically as a result of - * a cgroup_clone(), and the current task is about to - * be moved into 'cgroup'. - * - * Currently we refuse to set up the cgroup - thereby - * refusing the task to be entered, and as a result refusing - * the sys_unshare() or clone() which initiated it - if any - * sibling cpusets have exclusive cpus or mem. - * - * If this becomes a problem for some users who wish to - * allow that scenario, then cpuset_post_clone() could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. Called with cgroup_mutex - * held. 
- */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct cgroup *parent, *child; - struct cpuset *cs, *parent_cs; - - parent = cgroup->parent; - list_for_each_entry(child, &parent->children, sibling) { - cs = cgroup_cs(child); - if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) - return; - } - cs = cgroup_cs(cgroup); - parent_cs = cgroup_cs(parent); - - cs->mems_allowed = parent_cs->mems_allowed; - cs->cpus_allowed = parent_cs->cpus_allowed; - return; -} - -/* - * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem - * cont: control group that the new cpuset will be part of - */ - -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cpuset *cs; - struct cpuset *parent; - - if (!cont->parent) { - /* This is early initialization for the top cgroup */ - top_cpuset.mems_generation = cpuset_mems_generation++; - return &top_cpuset.css; - } - parent = cgroup_cs(cont->parent); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) - return ERR_PTR(-ENOMEM); - - cpuset_update_task_memory_state(); - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpus_clear(cs->cpus_allowed); - nodes_clear(cs->mems_allowed); - cs->mems_generation = cpuset_mems_generation++; - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; - - cs->parent = parent; - number_of_cpusets++; - return &cs->css ; -} - -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). 
- */ - -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - struct cpuset *cs = cgroup_cs(cont); - - cpuset_update_task_memory_state(); - - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; - kfree(cs); -} - -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", - .create = cpuset_create, - .destroy = cpuset_destroy, - .can_attach = cpuset_can_attach, - .attach = cpuset_attach, - .populate = cpuset_populate, - .post_clone = cpuset_post_clone, - .subsys_id = cpuset_subsys_id, - .early_init = 1, -}; - -/* - * cpuset_init_early - just enough so that the calls to - * cpuset_update_task_memory_state() in early init code - * are harmless. - */ - -int __init cpuset_init_early(void) -{ - top_cpuset.mems_generation = cpuset_mems_generation++; - return 0; -} - - -/** - * cpuset_init - initialize cpusets at system boot - * - * Description: Initialize top_cpuset and the cpuset internal file system, - **/ - -int __init cpuset_init(void) -{ - int err = 0; - - cpus_setall(top_cpuset.cpus_allowed); - nodes_setall(top_cpuset.mems_allowed); - - fmeter_init(&top_cpuset.fmeter); - top_cpuset.mems_generation = cpuset_mems_generation++; - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; - - err = register_filesystem(&cpuset_fs_type); - if (err < 0) - return err; - - number_of_cpusets = 1; - return 0; -} - -/** - * cpuset_do_move_task - move a given task to another cpuset - * @tsk: pointer to task_struct the task to move - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * Return nonzero to stop the walk through the tasks. 
- */ -static void cpuset_do_move_task(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - struct cpuset_hotplug_scanner *chsp; - - chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); - cgroup_attach_task(chsp->to, tsk); -} - -/** - * move_member_tasks_to_cpuset - move tasks from one cpuset to another - * @from: cpuset in which the tasks currently reside - * @to: cpuset to which the tasks will be moved - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - */ -static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) -{ - struct cpuset_hotplug_scanner scan; - - scan.scan.cg = from->css.cgroup; - scan.scan.test_task = NULL; /* select all tasks in cgroup */ - scan.scan.process_task = cpuset_do_move_task; - scan.scan.heap = NULL; - scan.to = to->css.cgroup; - - if (cgroup_scan_tasks(&scan.scan)) - printk(KERN_ERR "move_member_tasks_to_cpuset: " - "cgroup_scan_tasks failed\n"); -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). 
- */ - parent = cs->parent; - while (cpus_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent->parent; - - move_member_tasks_to_cpuset(cs, parent); -} - -/* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. - * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. - * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. - * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. - */ -static void scan_for_empty_cpusets(const struct cpuset *root) -{ - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; - nodemask_t oldmems; - - list_add_tail((struct list_head *)&root->stack_list, &queue); - - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); - } - - /* Continue past cpusets with all cpus, mems online */ - if (cpus_subset(cp->cpus_allowed, cpu_online_map) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; - - oldmems = cp->mems_allowed; - - /* Remove offline cpus and mems from this cpuset. 
*/ - mutex_lock(&callback_mutex); - cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); - - /* Move tasks from the empty cpuset to a parent */ - if (cpus_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems); - } - } -} - -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). - */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) -{ - struct sched_domain_attr *attr; - cpumask_t *doms; - int ndoms; - - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - - cgroup_lock(); - top_cpuset.cpus_allowed = cpu_online_map; - scan_for_empty_cpusets(&top_cpuset); - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). 
- */ -void cpuset_track_online_nodes(void) -{ - cgroup_lock(); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); - cgroup_unlock(); -} -#endif - -/** - * cpuset_init_smp - initialize cpus_allowed - * - * Description: Finish top cpuset after cpu, node maps are initialized - **/ - -void __init cpuset_init_smp(void) -{ - top_cpuset.cpus_allowed = cpu_online_map; - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - - hotcpu_notifier(cpuset_track_online_cpus, 0); -} - -/** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. - * - * Description: Returns the cpumask_t cpus_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_map, even if this means going outside the - * tasks cpuset. - **/ - -void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) -{ - mutex_lock(&callback_mutex); - cpuset_cpus_allowed_locked(tsk, pmask); - mutex_unlock(&callback_mutex); -} - -/** - * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. - **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) -{ - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); - task_unlock(tsk); -} - -void cpuset_init_current_mems_allowed(void) -{ - nodes_setall(current->mems_allowed); -} - -/** - * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. - * - * Description: Returns the nodemask_t mems_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of node_states[N_HIGH_MEMORY], even if this means going outside the - * tasks cpuset. 
- **/ - -nodemask_t cpuset_mems_allowed(struct task_struct *tsk) -{ - nodemask_t mask; - - mutex_lock(&callback_mutex); - task_lock(tsk); - guarantee_online_mems(task_cs(tsk), &mask); - task_unlock(tsk); - mutex_unlock(&callback_mutex); - - return mask; -} - -/** - * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed - * @nodemask: the nodemask to be checked - * - * Are any of the nodes in the nodemask allowed in current->mems_allowed? - */ -int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) -{ - return nodes_intersects(*nodemask, current->mems_allowed); -} - -/* - * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or - * mem_hardwall ancestor to the specified cpuset. Call holding - * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall - * (an unusual configuration), then returns the root cpuset. - */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) -{ - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; - return cs; -} - -/** - * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If - * __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If it's not a - * __GFP_HARDWALL request and this zone's nodes is in the nearest - * hardwalled cpuset ancestor to this tasks cpuset, yes. - * If the task has been OOM killed and has access to memory reserves - * as specified by the TIF_MEMDIE flag, yes. - * Otherwise, no. - * - * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() - * reduces to cpuset_zone_allowed_hardwall(). Otherwise, - * cpuset_zone_allowed_softwall() might sleep, and might allow a zone - * from an enclosing cpuset. 
- * - * cpuset_zone_allowed_hardwall() only handles the simpler case of - * hardwall cpusets, and never sleeps. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * GFP_USER allocations are marked with the __GFP_HARDWALL bit, - * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. - * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest enclosing hardwalled ancestor cpuset. - * - * Scanning up parent cpusets requires callback_mutex. The - * __alloc_pages() routine only calls here with __GFP_HARDWALL bit - * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the - * current tasks mems_allowed came up empty on the first pass over - * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_mutex - * mutex. - * - * The first call here from mm/page_alloc:get_page_from_freelist() - * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, - * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). - * - * The second pass through get_page_from_freelist() doesn't even call - * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() - * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set - * in alloc_flags. That logic and the checks below have the combined - * affect that: - * in_interrupt - any node ok (current task context irrelevant) - * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok - * GFP_KERNEL - any node in enclosing hardwalled cpuset ok - * GFP_USER - only nodes in current tasks mems allowed ok. 
- * - * Rule: - * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you - * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables - * the code that might scan up ancestor cpusets and sleep. - */ - -int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) -{ - int node; /* node that zone z is on */ - const struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ - - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - node = zone_to_nid(z); - might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; - - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; - - /* Not hardwall and node outside mems_allowed: scan up cpusets */ - mutex_lock(&callback_mutex); - - task_lock(current); - cs = nearest_hardwall_ancestor(task_cs(current)); - task_unlock(current); - - allowed = node_isset(node, cs->mems_allowed); - mutex_unlock(&callback_mutex); - return allowed; -} - -/* - * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. - * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the - * TIF_MEMDIE flag, yes. Otherwise, no. 
- * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * Unlike the cpuset_zone_allowed_softwall() variant, above, - * this variant requires that the zone be in the current tasks - * mems_allowed or that we're in interrupt. It does not scan up the - * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. - * It never sleeps. - */ - -int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) -{ - int node; /* node that zone z is on */ - - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - node = zone_to_nid(z); - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - return 0; -} - -/** - * cpuset_lock - lock out any changes to cpuset structures - * - * The out of memory (oom) code needs to mutex_lock cpusets - * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_mutex via this - * cpuset_lock() routine, so the oom code can lock it, before - * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_mutex. - */ - -void cpuset_lock(void) -{ - mutex_lock(&callback_mutex); -} - -/** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. 
- */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - -/** - * cpuset_mem_spread_node() - On which node to begin search for a page - * - * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for - * tasks in a cpuset with is_spread_page or is_spread_slab set), - * and if the memory allocation used cpuset_mem_spread_node() - * to determine on which node to start looking, as it will for - * certain page cache or slab cache pages such as used for file - * system buffers and inode caches, then instead of starting on the - * local node to look for a free page, rather spread the starting - * node around the tasks mems_allowed nodes. - * - * We don't have to worry about the returned node being offline - * because "it can't happen", and even if it did, it would be ok. - * - * The routines calling guarantee_online_mems() are careful to - * only set nodes in task->mems_allowed that are online. So it - * should not be possible for the following code to return an - * offline node. But if it did, that would be ok, as this routine - * is not returning the node where the allocation must be, only - * the node where the search should start. The zonelist passed to - * __alloc_pages() will include all nodes. If the slab allocator - * is passed an offline node, it will fall back to the local node. - * See kmem_cache_alloc_node(). - */ - -int cpuset_mem_spread_node(void) -{ - int node; - - node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - current->cpuset_mem_spread_rotor = node; - return node; -} -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** - * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? - * @tsk1: pointer to task_struct of some task. - * @tsk2: pointer to task_struct of some other task. - * - * Description: Return true if @tsk1's mems_allowed intersects the - * mems_allowed of @tsk2. 
Used by the OOM killer to determine if - * one of the task's memory usage might impact the memory available - * to the other. - **/ - -int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, - const struct task_struct *tsk2) -{ - return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); -} - -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - **/ - -void __cpuset_memory_pressure_bump(void) -{ - task_lock(current); - fmeter_markevent(&task_cs(current)->fmeter); - task_unlock(current); -} - -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc//cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it - * anyway. 
- */ -static int proc_cpuset_show(struct seq_file *m, void *unused_v) -{ - struct pid *pid; - struct task_struct *tsk; - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) - goto out; - - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = -EINVAL; - cgroup_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); - if (retval < 0) - goto out_unlock; - seq_puts(m, buf); - seq_putc(m, '\n'); -out_unlock: - cgroup_unlock(); - put_task_struct(tsk); -out_free: - kfree(buf); -out: - return retval; -} - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROC_PID_CPUSET */ - -/* Display task cpus_allowed, mems_allowed in /proc//status file. */ -void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) -{ - seq_printf(m, "Cpus_allowed:\t"); - m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); - m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed:\t"); - m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed_list:\t"); - m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); - seq_printf(m, "\n"); -} -/* delayacct.c - per-task delay accounting - * - * Copyright (C) Shailabh Nagar, IBM Corp. 
2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include - -int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -struct kmem_cache *delayacct_cache; - -static int __init delayacct_setup_disable(char *str) -{ - delayacct_on = 0; - return 1; -} -__setup("nodelayacct", delayacct_setup_disable); - -void delayacct_init(void) -{ - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); - delayacct_tsk_init(&init_task); -} - -void __delayacct_tsk_init(struct task_struct *tsk) -{ - tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); - if (tsk->delays) - spin_lock_init(&tsk->delays->lock); -} - -/* - * Start accounting for a delay statistic using - * its starting timestamp (@start) - */ - -static inline void delayacct_start(struct timespec *start) -{ - do_posix_clock_monotonic_gettime(start); -} - -/* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count - */ - -static void delayacct_end(struct timespec *start, struct timespec *end, - u64 *total, u32 *count) -{ - struct timespec ts; - s64 ns; - unsigned long flags; - - do_posix_clock_monotonic_gettime(end); - ts = timespec_sub(*end, *start); - ns = timespec_to_ns(&ts); - if (ns < 0) - return; - - spin_lock_irqsave(¤t->delays->lock, flags); - *total += ns; - (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); -} - -void __delayacct_blkio_start(void) -{ - delayacct_start(¤t->delays->blkio_start); -} - -void 
__delayacct_blkio_end(void) -{ - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); -} - -int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) -{ - s64 tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; - struct timespec ts; - - /* Though tsk->delays accessed later, early exit avoids - * unnecessary returning of other data - */ - if (!tsk->delays) - goto done; - - tmp = (s64)d->cpu_run_real_total; - cputime_to_timespec(tsk->utime + tsk->stime, &ts); - tmp += timespec_to_ns(&ts); - d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - - tmp = (s64)d->cpu_scaled_run_real_total; - cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); - tmp += timespec_to_ns(&ts); - d->cpu_scaled_run_real_total = - (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; - - /* - * No locking available for sched_info (and too expensive to add one) - * Mitigate by taking snapshot of values - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; - t3 = tsk->sched_info.cpu_time; - - d->cpu_count += t1; - - tmp = (s64)d->cpu_delay_total + t2; - d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - - tmp = (s64)d->cpu_run_virtual_total + t3; - d->cpu_run_virtual_total = - (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; - - /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - - spin_lock_irqsave(&tsk->delays->lock, flags); - tmp = d->blkio_delay_total + tsk->delays->blkio_delay; - d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; - tmp = d->swapin_delay_total + tsk->delays->swapin_delay; - d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 
0 : tmp; - tmp = d->freepages_delay_total + tsk->delays->freepages_delay; - d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; - d->blkio_count += tsk->delays->blkio_count; - d->swapin_count += tsk->delays->swapin_count; - d->freepages_count += tsk->delays->freepages_count; - spin_unlock_irqrestore(&tsk->delays->lock, flags); - -done: - return 0; -} - -__u64 __delayacct_blkio_ticks(struct task_struct *tsk) -{ - __u64 ret; - unsigned long flags; - - spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); - spin_unlock_irqrestore(&tsk->delays->lock, flags); - return ret; -} - -void __delayacct_freepages_start(void) -{ - delayacct_start(¤t->delays->freepages_start); -} - -void __delayacct_freepages_end(void) -{ - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_end, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); -} - -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include -#include - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - 
dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - int order = get_order(size); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; - } - return (mem != NULL); -} - -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ - * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. - * - * Written by Hennus Bergman, 1992. - * - * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma. - * In the previous version the reported device could end up being wrong, - * if a device requested a DMA channel that was already in use. - * [It also happened to remove the sizeof(char *) == sizeof(int) - * assumption introduced because of those /proc/dma patches. 
-- Hennus] - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -/* A note on resource allocation: - * - * All drivers needing DMA channels, should allocate and release them - * through the public routines `request_dma()' and `free_dma()'. - * - * In order to avoid problems, all processes should allocate resources in - * the same sequence and release them in the reverse order. - * - * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA. - * When releasing them, first release the DMA, then release the IRQ. - * If you don't, you may cause allocation requests to fail unnecessarily. - * This doesn't really matter now, but it will once we get real semaphores - * in the kernel. - */ - - -DEFINE_SPINLOCK(dma_spin_lock); - -/* - * If our port doesn't define this it has no PC like DMA - */ - -#ifdef MAX_DMA_CHANNELS - - -/* Channel n is busy iff dma_chan_busy[n].lock != 0. - * DMA0 used to be reserved for DRAM refresh, but apparently not any more... - * DMA4 is reserved for cascading. 
- */ - -struct dma_chan { - int lock; - const char *device_id; -}; - -static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { - [4] = { 1, "cascade" }, -}; - - -/** - * request_dma - request and reserve a system DMA channel - * @dmanr: DMA channel number - * @device_id: reserving device ID string, used in /proc/dma - */ -int request_dma(unsigned int dmanr, const char * device_id) -{ - if (dmanr >= MAX_DMA_CHANNELS) - return -EINVAL; - - if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) - return -EBUSY; - - dma_chan_busy[dmanr].device_id = device_id; - - /* old flag was 0, now contains 1 to indicate busy */ - return 0; -} /* request_dma */ - -/** - * free_dma - free a reserved system DMA channel - * @dmanr: DMA channel number - */ -void free_dma(unsigned int dmanr) -{ - if (dmanr >= MAX_DMA_CHANNELS) { - printk(KERN_WARNING "Trying to free DMA%d\n", dmanr); - return; - } - - if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { - printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); - return; - } - -} /* free_dma */ - -#else - -int request_dma(unsigned int dmanr, const char *device_id) -{ - return -EINVAL; -} - -void free_dma(unsigned int dmanr) -{ -} - -#endif - -#ifdef CONFIG_PROC_FS - -#ifdef MAX_DMA_CHANNELS -static int proc_dma_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { - if (dma_chan_busy[i].lock) { - seq_printf(m, "%2d: %s\n", i, - dma_chan_busy[i].device_id); - } - } - return 0; -} -#else -static int proc_dma_show(struct seq_file *m, void *v) -{ - seq_puts(m, "No DMA\n"); - return 0; -} -#endif /* MAX_DMA_CHANNELS */ - -static int proc_dma_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_dma_show, NULL); -} - -static const struct file_operations proc_dma_operations = { - .open = proc_dma_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_dma_init(void) -{ - proc_create("dma", 0, NULL, &proc_dma_operations); - return 0; -} - 
-__initcall(proc_dma_init); -#endif - -EXPORT_SYMBOL(request_dma); -EXPORT_SYMBOL(free_dma); -EXPORT_SYMBOL(dma_spin_lock); -/* - * Handling of different ABIs (personalities). - * - * We group personalities into execution domains which have their - * own handlers for kernel entry points, signal mapping, etc... - * - * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static void default_handler(int, struct pt_regs *); - -static struct exec_domain *exec_domains = &default_exec_domain; -static DEFINE_RWLOCK(exec_domains_lock); - - -static u_long ident_map[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -struct exec_domain default_exec_domain = { - .name = "Linux", /* name */ - .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ - .pers_high = 0, /* PER_LINUX personality. */ - .signal_map = ident_map, /* Identity map signals. */ - .signal_invmap = ident_map, /* - both ways. 
*/ -}; - - -static void -default_handler(int segment, struct pt_regs *regp) -{ - set_personality(0); - - if (current_thread_info()->exec_domain->handler != default_handler) - current_thread_info()->exec_domain->handler(segment, regp); - else - send_sig(SIGSEGV, current, 1); -} - -static struct exec_domain * -lookup_exec_domain(u_long personality) -{ - struct exec_domain * ep; - u_long pers = personality(personality); - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } - -#ifdef CONFIG_MODULES - read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); - read_lock(&exec_domains_lock); - - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } -#endif - - ep = &default_exec_domain; -out: - read_unlock(&exec_domains_lock); - return (ep); -} - -int -register_exec_domain(struct exec_domain *ep) -{ - struct exec_domain *tmp; - int err = -EBUSY; - - if (ep == NULL) - return -EINVAL; - - if (ep->next != NULL) - return -EBUSY; - - write_lock(&exec_domains_lock); - for (tmp = exec_domains; tmp; tmp = tmp->next) { - if (tmp == ep) - goto out; - } - - ep->next = exec_domains; - exec_domains = ep; - err = 0; - -out: - write_unlock(&exec_domains_lock); - return (err); -} - -int -unregister_exec_domain(struct exec_domain *ep) -{ - struct exec_domain **epp; - - epp = &exec_domains; - write_lock(&exec_domains_lock); - for (epp = &exec_domains; *epp; epp = &(*epp)->next) { - if (ep == *epp) - goto unregister; - } - write_unlock(&exec_domains_lock); - return -EINVAL; - -unregister: - *epp = ep->next; - ep->next = NULL; - write_unlock(&exec_domains_lock); - return 0; -} - -int -__set_personality(u_long personality) -{ - struct exec_domain *ep, *oep; - - ep = lookup_exec_domain(personality); - if (ep == current_thread_info()->exec_domain) { - 
current->personality = personality; - module_put(ep->module); - return 0; - } - - if (atomic_read(¤t->fs->count) != 1) { - struct fs_struct *fsp, *ofsp; - - fsp = copy_fs_struct(current->fs); - if (fsp == NULL) { - module_put(ep->module); - return -ENOMEM; - } - - task_lock(current); - ofsp = current->fs; - current->fs = fsp; - task_unlock(current); - - put_fs_struct(ofsp); - } - - /* - * At that point we are guaranteed to be the sole owner of - * current->fs. - */ - - current->personality = personality; - oep = current_thread_info()->exec_domain; - current_thread_info()->exec_domain = ep; - - module_put(oep->module); - return 0; -} - -int -get_exec_domain_list(char *page) -{ - struct exec_domain *ep; - int len = 0; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) - len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", - ep->pers_low, ep->pers_high, ep->name, - module_name(ep->module)); - read_unlock(&exec_domains_lock); - return (len); -} - -SYSCALL_DEFINE1(personality, u_long, personality) -{ - u_long old = current->personality; - - if (personality != 0xffffffff) { - set_personality(personality); - if (current->personality != personality) - return -EINVAL; - } - - return (long)old; -} - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); -/* - * linux/kernel/exit.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for audit_free() */ -#include -#include -#include -#include - -#include -#include -#include -#include - -static void exit_mm(struct task_struct * 
tsk); - -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - -static void __unhash_process(struct task_struct *p) -{ - nr_threads--; - detach_pid(p, PIDTYPE_PID); - if (thread_group_leader(p)) { - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); - - list_del_rcu(&p->tasks); - __get_cpu_var(process_counts)--; - } - list_del_rcu(&p->thread_group); - list_del_init(&p->sibling); -} - -/* - * This function expects the tasklist_lock write-locked. - */ -static void __exit_signal(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct sighand_struct *sighand; - - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); - - sighand = rcu_dereference(tsk->sighand); - spin_lock(&sighand->siglock); - - posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) - posix_cpu_timers_exit_group(tsk); - else { - /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - - /* - * If there is any task waiting for the group exit - * then notify it: - */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) - wake_up_process(sig->group_exit_task); - - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. 
- */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); - sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - sig = NULL; /* Marker for below. */ - } - - __unhash_process(tsk); - - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); - - tsk->signal = NULL; - tsk->sighand = NULL; - spin_unlock(&sighand->siglock); - - __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (sig) { - flush_sigqueue(&sig->shared_pending); - taskstats_tgid_free(sig); - __cleanup_signal(sig); - } -} - -static void delayed_put_task_struct(struct rcu_head *rhp) -{ - put_task_struct(container_of(rhp, struct task_struct, rcu)); -} - - -void release_task(struct task_struct * p) -{ - struct task_struct *leader; - int zap_leader; -repeat: - tracehook_prepare_release_task(p); - atomic_dec(&p->user->processes); - proc_flush_task(p); - write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); - __exit_signal(p); - - /* - * If we are the last non-leader member of the thread - * group, and the leader is zombie, then notify the - * group leader's parent process. (if it wants notification.) - */ - zap_leader = 0; - leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(task_detached(leader)); - do_notify_parent(leader, leader->exit_signal); - /* - * If we were the last child thread and the leader has - * exited already, and the leader's parent ignores SIGCHLD, - * then we are the one who should release the leader. 
- * - * do_notify_parent() will have marked it self-reaping in - * that case. - */ - zap_leader = task_detached(leader); - - /* - * This maintains the invariant that release_task() - * only runs on a task in EXIT_DEAD, just for sanity. - */ - if (zap_leader) - leader->exit_state = EXIT_DEAD; - } - - write_unlock_irq(&tasklist_lock); - release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); - - p = leader; - if (unlikely(zap_leader)) - goto repeat; -} - -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* - * Determine if a process group is "orphaned", according to the POSIX - * definition in 2.2.2.52. Orphaned process groups are not to be affected - * by terminal-generated stop signals. Newly orphaned process groups are - * to receive a SIGHUP and a SIGCONT. - * - * "I ask you, have you ever known what it is to be an orphan?" 
- */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) -{ - struct task_struct *p; - - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if ((p == ignored_task) || - (p->exit_state && thread_group_empty(p)) || - is_global_init(p->real_parent)) - continue; - - if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) - return 0; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - - return 1; -} - -int is_current_pgrp_orphaned(void) -{ - int retval; - - read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); - read_unlock(&tasklist_lock); - - return retval; -} - -static int has_stopped_jobs(struct pid *pgrp) -{ - int retval = 0; - struct task_struct *p; - - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (!task_is_stopped(p)) - continue; - retval = 1; - break; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return retval; -} - -/* - * Check to see if any process groups have become orphaned as - * a result of our exiting, and if they have any stopped jobs, - * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ -static void -kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) -{ - struct pid *pgrp = task_pgrp(tsk); - struct task_struct *ignored_task = tsk; - - if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ - parent = tsk->real_parent; - else - /* reparent: our child is in a different pgrp than - * we are, and it was the only connection outside. 
- */ - ignored_task = NULL; - - if (task_pgrp(parent) != pgrp && - task_session(parent) == task_session(tsk) && - will_become_orphaned_pgrp(pgrp, ignored_task) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } -} - -/** - * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to kthreadd so it - * isn't in the way of other processes and is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited from a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_kthreadd() gives the caller full capabilities. - */ -static void reparent_to_kthreadd(void) -{ - write_lock_irq(&tasklist_lock); - - ptrace_unlink(current); - /* Reparent to init */ - current->real_parent = current->parent = kthreadd_task; - list_move_tail(¤t->sibling, ¤t->real_parent->children); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - if (task_nice(current) < 0) - set_user_nice(current, 0); - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? 
*/ - security_task_reparent_to_init(current); - memcpy(current->signal->rlim, init_task.signal->rlim, - sizeof(current->signal->rlim)); - atomic_inc(&(INIT_USER->__count)); - write_unlock_irq(&tasklist_lock); - switch_uid(INIT_USER); -} - -void __set_special_pids(struct pid *pid) -{ - struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - - if (task_session(curr) != pid) { - change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { - change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } -} - -static void set_special_pids(struct pid *pid) -{ - write_lock_irq(&tasklist_lock); - __set_special_pids(pid); - write_unlock_irq(&tasklist_lock); -} - -/* - * Let kernel threads use this to say that they - * allow a certain signal (since daemonize() will - * have disabled all of them by default). - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - sigdelset(¤t->blocked, sig); - if (!current->mm) { - /* Kernel threads handle their own signals. - Let the signal code know it'll be handled, so - that they don't get converted to SIGKILL or - just silently dropped */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(const char *name, ...) 
-{ - va_list args; - struct fs_struct *fs; - sigset_t blocked; - - va_start(args, name); - vsnprintf(current->comm, sizeof(current->comm), name, args); - va_end(args); - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(current); - /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation - * or suspend transition begins right now. - */ - current->flags |= (PF_NOFREEZE | PF_KTHREAD); - - if (current->nsproxy != &init_nsproxy) { - get_nsproxy(&init_nsproxy); - switch_task_namespaces(current, &init_nsproxy); - } - set_special_pids(&init_struct_pid); - proc_clear_tty(current); - - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* Become as one with the init task */ - - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); - - reparent_to_kthreadd(); -} - -EXPORT_SYMBOL(daemonize); - -static void close_files(struct files_struct * files) -{ - int i, j; - struct fdtable *fdt; - - j = 0; - - /* - * It is safe to dereference the fd table without RCU or - * ->file_lock because this is the last reference to the - * files structure. 
- */ - fdt = files_fdtable(files); - for (;;) { - unsigned long set; - i = j * __NFDBITS; - if (i >= fdt->max_fds) - break; - set = fdt->open_fds->fds_bits[j++]; - while (set) { - if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); - if (file) { - filp_close(file, files); - cond_resched(); - } - } - i++; - set >>= 1; - } - } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -void put_files_struct(struct files_struct *files) -{ - struct fdtable *fdt; - - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* - * Free the fd and fdset arrays if we expanded them. - * If the fdtable was embedded, pass files for freeing - * at the end of the RCU grace period. Otherwise, - * you can free files immediately. - */ - fdt = files_fdtable(files); - if (fdt != &files->fdtab) - kmem_cache_free(files_cachep, files); - free_fdtable(fdt); - } -} - -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ - struct files_struct * files = tsk->files; - - if (files) { - task_lock(tsk); - tsk->files = NULL; - task_unlock(tsk); - put_files_struct(files); - } -} - -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - -#ifdef CONFIG_MM_OWNER -/* - * Task p is exiting and it owned mm, lets find a 
new owner for it - */ -static inline int -mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) -{ - /* - * If there are other users of the mm and the owner (us) is exiting - * we need to find a new owner to take on the responsibility. - */ - if (atomic_read(&mm->mm_users) <= 1) - return 0; - if (mm->owner != p) - return 0; - return 1; -} - -void mm_update_next_owner(struct mm_struct *mm) -{ - struct task_struct *c, *g, *p = current; - -retry: - if (!mm_need_new_owner(mm, p)) - return; - - read_lock(&tasklist_lock); - /* - * Search in the children - */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; - } - - /* - * Search in the siblings - */ - list_for_each_entry(c, &p->parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; - } - - /* - * Search through everything else. We should not get - * here often - */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - - read_unlock(&tasklist_lock); - /* - * We found no owner yet mm_users > 1: this implies that we are - * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. - */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); - mm->owner = NULL; - up_write(&mm->mmap_sem); - return; - -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. 
- * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - cgroup_mm_owner_callbacks(mm->owner, c); - mm->owner = c; - task_unlock(c); - put_task_struct(c); -} -#endif /* CONFIG_MM_OWNER */ - -/* - * Turn us into a lazy TLB process if we - * aren't already.. - */ -static void exit_mm(struct task_struct * tsk) -{ - struct mm_struct *mm = tsk->mm; - struct core_state *core_state; - - mm_release(tsk, mm); - if (!mm) - return; - /* - * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. - */ - down_read(&mm->mmap_sem); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - up_read(&mm->mmap_sem); - - self.task = tsk; - self.next = xchg(&core_state->dumper.next, &self); - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_task_state(tsk, TASK_RUNNING); - down_read(&mm->mmap_sem); - } - atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); - /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; - up_read(&mm->mmap_sem); - enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); - task_unlock(tsk); - mm_update_next_owner(mm); - mmput(mm); -} - -/* - * Return nonzero if @parent's children should reap themselves. - * - * Called with write_lock_irq(&tasklist_lock) held. 
- */ -static int ignoring_children(struct task_struct *parent) -{ - int ret; - struct sighand_struct *psig = parent->sighand; - unsigned long flags; - spin_lock_irqsave(&psig->siglock, flags); - ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || - (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); - spin_unlock_irqrestore(&psig->siglock, flags); - return ret; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ - struct task_struct *p, *n; - int ign = -1; - - list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - continue; - - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself, add it to the @dead list. We can't call - * release_task() here because we already hold tasklist_lock. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, parent)) - do_notify_parent(p, p->exit_signal); - else { - if (ign < 0) - ign = ignoring_children(parent); - if (ign) - p->exit_signal = -1; - } - } - - if (task_detached(p)) { - /* - * Mark it as in the process of being reaped. - */ - p->exit_state = EXIT_DEAD; - list_add(&p->ptrace_entry, dead); - } - } -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. 
- */ -static void ptrace_exit_finish(struct task_struct *parent, - struct list_head *dead) -{ - struct task_struct *p, *n; - - BUG_ON(!list_empty(&parent->ptraced)); - - list_for_each_entry_safe(p, n, dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } -} - -static void reparent_thread(struct task_struct *p, struct task_struct *father) -{ - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - - list_move_tail(&p->sibling, &p->real_parent->children); - - /* If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) - return; - - /* We don't want people slaying init. */ - if (!task_detached(p)) - p->exit_signal = SIGCHLD; - - /* If we'd notified the old parent about this child's death, - * also notify the new parent. - */ - if (!ptrace_reparented(p) && - p->exit_state == EXIT_ZOMBIE && - !task_detached(p) && thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - - kill_orphaned_pgrp(p, father); -} - -/* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. 
- */ -static struct task_struct *find_new_reaper(struct task_struct *father) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; - - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; - return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; - } - - return pid_ns->child_reaper; -} - -static void forget_original_parent(struct task_struct *father) -{ - struct task_struct *p, *n, *reaper; - LIST_HEAD(ptrace_dead); - - write_lock_irq(&tasklist_lock); - reaper = find_new_reaper(father); - /* - * First clean up ptrace if we were using it. - */ - ptrace_exit(father, &ptrace_dead); - - list_for_each_entry_safe(p, n, &father->children, sibling) { - p->real_parent = reaper; - if (p->parent == father) { - BUG_ON(p->ptrace); - p->parent = p->real_parent; - } - reparent_thread(p, father); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&father->children)); - - ptrace_exit_finish(father, &ptrace_dead); -} - -/* - * Send signals to all our closest relatives so that they know - * to properly mourn us.. - */ -static void exit_notify(struct task_struct *tsk, int group_dead) -{ - int signal; - void *cookie; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) - */ - forget_original_parent(tsk); - exit_task_namespaces(tsk); - - write_lock_irq(&tasklist_lock); - if (group_dead) - kill_orphaned_pgrp(tsk->group_leader, NULL); - - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. - */ - if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) - tsk->exit_signal = SIGCHLD; - - signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal >= 0) - signal = do_notify_parent(tsk, signal); - - tsk->exit_state = signal == DEATH_REAP ? 
EXIT_DEAD : EXIT_ZOMBIE; - - /* mt-exec, de_thread() is waiting for us */ - if (thread_group_leader(tsk) && - tsk->signal->group_exit_task && - tsk->signal->notify_count < 0) - wake_up_process(tsk->signal->group_exit_task); - - write_unlock_irq(&tasklist_lock); - - tracehook_report_death(tsk, signal, cookie, group_dead); - - /* If the process is dead, release it - nobody will wait for it */ - if (signal == DEATH_REAP) - release_task(tsk); -} - -#ifdef CONFIG_DEBUG_STACK_USAGE -static void check_stack_usage(void) -{ - static DEFINE_SPINLOCK(low_water_lock); - static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); - unsigned long free; - - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); - - if (free >= lowest_to_date) - return; - - spin_lock(&low_water_lock); - if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); - lowest_to_date = free; - } - spin_unlock(&low_water_lock); -} -#else -static inline void check_stack_usage(void) {} -#endif - -NORET_TYPE void do_exit(long code) -{ - struct task_struct *tsk = current; - int group_dead; - - profile_task_exit(tsk); - - WARN_ON(atomic_read(&tsk->fs_excl)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - set_fs(USER_DS); - - tracehook_report_exit(&code); - - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. 
- */ - if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; - if (tsk->io_context) - exit_io_context(); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - - exit_signals(tsk); /* sets PF_EXITING */ - /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. - */ - smp_mb(); - spin_unlock_wait(&tsk->pi_lock); - - if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - - acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } - group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) { - hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); - } - acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif - if (group_dead) - tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); - - tsk->exit_code = code; - taskstats_exit(tsk, group_dead); - - exit_mm(tsk); - - if (group_dead) - acct_process(); - exit_sem(tsk); - exit_files(tsk); - exit_fs(tsk); - check_stack_usage(); - exit_thread(); - cgroup_exit(tsk, 1); - exit_keys(tsk); - - if (group_dead && tsk->signal->leader) - disassociate_ctty(1); - - module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - 
module_put(tsk->binfmt->module); - - proc_exit_connector(tsk); - exit_notify(tsk, group_dead); -#ifdef CONFIG_NUMA - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; -#endif -#ifdef CONFIG_FUTEX - /* - * This must happen late, after the PID is not - * hashed anymore: - */ - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); - if (unlikely(current->pi_state_cache)) - kfree(current->pi_state_cache); -#endif - /* - * Make sure we are holding no locks: - */ - debug_check_no_locks_held(tsk); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. - */ - tsk->flags |= PF_EXITPIDONE; - - if (tsk->io_context) - exit_io_context(); - - if (tsk->splice_pipe) - __free_pipe_info(tsk->splice_pipe); - - preempt_disable(); - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ -} - -EXPORT_SYMBOL_GPL(do_exit); - -NORET_TYPE void complete_and_exit(struct completion *comp, long code) -{ - if (comp) - complete(comp); - - do_exit(code); -} - -EXPORT_SYMBOL(complete_and_exit); - -SYSCALL_DEFINE1(exit, int, error_code) -{ - do_exit((error_code&0xff)<<8); -} - -/* - * Take down every thread in the group. This is called by fatal signals - * as well as by sys_exit_group (below). - */ -NORET_TYPE void -do_group_exit(int exit_code) -{ - struct signal_struct *sig = current->signal; - - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) - exit_code = sig->group_exit_code; - else if (!thread_group_empty(current)) { - struct sighand_struct *const sighand = current->sighand; - spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) - /* Another thread got here before we took the lock. 
*/ - exit_code = sig->group_exit_code; - else { - sig->group_exit_code = exit_code; - sig->flags = SIGNAL_GROUP_EXIT; - zap_other_threads(current); - } - spin_unlock_irq(&sighand->siglock); - } - - do_exit(exit_code); - /* NOTREACHED */ -} - -/* - * this kills every thread in the thread group. Note that any externally - * wait4()-ing process will get the correct exit code - even if this - * thread is not the thread group leader. - */ -SYSCALL_DEFINE1(exit_group, int, error_code) -{ - do_group_exit((error_code & 0xff) << 8); - /* NOTREACHED */ - return 0; -} - -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; -} - -static int eligible_child(enum pid_type type, struct pid *pid, int options, - struct task_struct *p) -{ - int err; - - if (type < PIDTYPE_MAX) { - if (task_pid_type(p, type) != pid) - return 0; - } - - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) - && !(options & __WALL)) - return 0; - - err = security_task_wait(p); - if (err) - return err; - - return 1; -} - -static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, - int why, int status, - struct siginfo __user *infop, - struct rusage __user *rusagep) -{ - int retval = rusagep ? 
getrusage(p, RUSAGE_BOTH, rusagep) : 0; - - put_task_struct(p); - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); - if (!retval) - retval = pid; - return retval; -} - -/* - * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. - */ -static int wait_task_zombie(struct task_struct *p, int options, - struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - unsigned long state; - int retval, status, traced; - pid_t pid = task_pid_vnr(p); - - if (!likely(options & WEXITED)) - return 0; - - if (unlikely(options & WNOWAIT)) { - uid_t uid = p->uid; - int exit_code = p->exit_code; - int why, status; - - get_task_struct(p); - read_unlock(&tasklist_lock); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; - } - return wait_noreap_copyout(p, pid, uid, why, - status, infop, ru); - } - - /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: - */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); - return 0; - } - - traced = ptrace_reparented(p); - - if (likely(!traced)) { - struct signal_struct *psig; - struct signal_struct *sig; - - /* - * The resource counters for the group leader are in its - * own task_struct. 
Those for dead threads in the group - * are in its signal_struct, as are those for the child - * processes it has previously reaped. All these - * accumulate in the parent's signal_struct c* fields. - * - * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to p->parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. - */ - spin_lock_irq(&p->parent->sighand->siglock); - psig = p->parent->signal; - sig = p->signal; - psig->cutime = - cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); - psig->cstime = - cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); - psig->cgtime = - cputime_add(psig->cgtime, - cputime_add(p->gtime, - cputime_add(sig->gtime, - sig->cgtime))); - psig->cmin_flt += - p->min_flt + sig->min_flt + sig->cmin_flt; - psig->cmaj_flt += - p->maj_flt + sig->maj_flt + sig->cmaj_flt; - psig->cnvcsw += - p->nvcsw + sig->nvcsw + sig->cnvcsw; - psig->cnivcsw += - p->nivcsw + sig->nivcsw + sig->cnivcsw; - psig->cinblock += - task_io_get_inblock(p) + - sig->inblock + sig->cinblock; - psig->coublock += - task_io_get_oublock(p) + - sig->oublock + sig->coublock; - task_io_accounting_add(&psig->ioac, &p->ioac); - task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->parent->sighand->siglock); - } - - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. - */ - read_unlock(&tasklist_lock); - - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - status = (p->signal->flags & SIGNAL_GROUP_EXIT) - ? 
p->signal->group_exit_code : p->exit_code; - if (!retval && stat_addr) - retval = put_user(status, stat_addr); - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - - if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; - } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; - } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); - } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); - if (!retval) - retval = pid; - - if (traced) { - write_lock_irq(&tasklist_lock); - /* We dropped tasklist, ptracer could die and untrace */ - ptrace_unlink(p); - /* - * If this is not a detached task, notify the parent. - * If it's still not detached after that, don't release - * it now. - */ - if (!task_detached(p)) { - do_notify_parent(p, p->exit_signal); - if (!task_detached(p)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } - } - write_unlock_irq(&tasklist_lock); - } - if (p != NULL) - release_task(p); - - return retval; -} - -/* - * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. 
- */ -static int wait_task_stopped(int ptrace, struct task_struct *p, - int options, struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - int retval, exit_code, why; - uid_t uid = 0; /* unneeded, required by compiler */ - pid_t pid; - - if (!(options & WUNTRACED)) - return 0; - - exit_code = 0; - spin_lock_irq(&p->sighand->siglock); - - if (unlikely(!task_is_stopped_or_traced(p))) - goto unlock_sig; - - if (!ptrace && p->signal->group_stop_count > 0) - /* - * A group stop is in progress and this is the group leader. - * We won't report until all threads have stopped. - */ - goto unlock_sig; - - exit_code = p->exit_code; - if (!exit_code) - goto unlock_sig; - - if (!unlikely(options & WNOWAIT)) - p->exit_code = 0; - - uid = p->uid; -unlock_sig: - spin_unlock_irq(&p->sighand->siglock); - if (!exit_code) - return 0; - - /* - * Now we are pretty sure this task is interesting. - * Make sure it doesn't get reaped out from under us while we - * give up the lock and then examine it below. We don't want to - * keep holding onto the tasklist_lock while we call getrusage and - * possibly take page faults for user memory. - */ - get_task_struct(p); - pid = task_pid_vnr(p); - why = ptrace ? CLD_TRAPPED : CLD_STOPPED; - read_unlock(&tasklist_lock); - - if (unlikely(options & WNOWAIT)) - return wait_noreap_copyout(p, pid, uid, - why, exit_code, - infop, ru); - - retval = ru ? 
getrusage(p, RUSAGE_BOTH, ru) : 0; - if (!retval && stat_addr) - retval = put_user((exit_code << 8) | 0x7f, stat_addr); - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - put_task_struct(p); - - BUG_ON(!retval); - return retval; -} - -/* - * Handle do_wait work for one task in a live, non-stopped state. - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. - */ -static int wait_task_continued(struct task_struct *p, int options, - struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - int retval; - pid_t pid; - uid_t uid; - - if (!unlikely(options & WCONTINUED)) - return 0; - - if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) - return 0; - - spin_lock_irq(&p->sighand->siglock); - /* Re-check with the lock held. */ - if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { - spin_unlock_irq(&p->sighand->siglock); - return 0; - } - if (!unlikely(options & WNOWAIT)) - p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - spin_unlock_irq(&p->sighand->siglock); - - pid = task_pid_vnr(p); - uid = p->uid; - get_task_struct(p); - read_unlock(&tasklist_lock); - - if (!infop) { - retval = ru ? 
getrusage(p, RUSAGE_BOTH, ru) : 0; - put_task_struct(p); - if (!retval && stat_addr) - retval = put_user(0xffff, stat_addr); - if (!retval) - retval = pid; - } else { - retval = wait_noreap_copyout(p, pid, uid, - CLD_CONTINUED, SIGCONT, - infop, ru); - BUG_ON(retval == 0); - } - - return retval; -} - -/* - * Consider @p for a wait by @parent. - * - * -ECHILD should be in *@notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. - * Returns zero if the search for a child should continue; - * then *@notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. - */ -static int wait_consider_task(struct task_struct *parent, int ptrace, - struct task_struct *p, int *notask_error, - enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) -{ - int ret = eligible_child(type, pid, options, p); - if (!ret) - return ret; - - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (*notask_error) - *notask_error = ret; - } - - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* - * This child is hidden by ptrace. - * We aren't allowed to see it now, but eventually we will. - */ - *notask_error = 0; - return 0; - } - - if (p->exit_state == EXIT_DEAD) - return 0; - - /* - * We don't reap group leaders with subthreads. - */ - if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) - return wait_task_zombie(p, options, infop, stat_addr, ru); - - /* - * It's stopped or running now, so it might - * later continue, exit, or stop again. 
- */ - *notask_error = 0; - - if (task_is_stopped_or_traced(p)) - return wait_task_stopped(ptrace, p, options, - infop, stat_addr, ru); - - return wait_task_continued(p, options, infop, stat_addr, ru); -} - -/* - * Do the work of do_wait() for one thread in the group, @tsk. - * - * -ECHILD should be in *@notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. - * Returns zero if the search for a child should continue; then - * *@notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. - */ -static int do_wait_thread(struct task_struct *tsk, int *notask_error, - enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, int __user *stat_addr, - struct rusage __user *ru) -{ - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - /* - * Do not consider detached threads. - */ - if (!task_detached(p)) { - int ret = wait_consider_task(tsk, 0, p, notask_error, - type, pid, options, - infop, stat_addr, ru); - if (ret) - return ret; - } - } - - return 0; -} - -static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, - enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, int __user *stat_addr, - struct rusage __user *ru) -{ - struct task_struct *p; - - /* - * Traditionally we see ptrace'd stopped tasks regardless of options. 
- */ - options |= WUNTRACED; - - list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(tsk, 1, p, notask_error, - type, pid, options, - infop, stat_addr, ru); - if (ret) - return ret; - } - - return 0; -} - -static long do_wait(enum pid_type type, struct pid *pid, int options, - struct siginfo __user *infop, int __user *stat_addr, - struct rusage __user *ru) -{ - DECLARE_WAITQUEUE(wait, current); - struct task_struct *tsk; - int retval; - - add_wait_queue(¤t->signal->wait_chldexit,&wait); -repeat: - /* - * If there is nothing that can match our critiera just get out. - * We will clear @retval to zero if we see any child that might later - * match our criteria, even if we are not able to reap it yet. - */ - retval = -ECHILD; - if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) - goto end; - - current->state = TASK_INTERRUPTIBLE; - read_lock(&tasklist_lock); - tsk = current; - do { - int tsk_result = do_wait_thread(tsk, &retval, - type, pid, options, - infop, stat_addr, ru); - if (!tsk_result) - tsk_result = ptrace_do_wait(tsk, &retval, - type, pid, options, - infop, stat_addr, ru); - if (tsk_result) { - /* - * tasklist_lock is unlocked and we have a final result. - */ - retval = tsk_result; - goto end; - } - - if (options & __WNOTHREAD) - break; - tsk = next_thread(tsk); - BUG_ON(tsk->signal != current->signal); - } while (tsk != current); - read_unlock(&tasklist_lock); - - if (!retval && !(options & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } - -end: - current->state = TASK_RUNNING; - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (infop) { - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. 
- */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } - return retval; -} - -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) -{ - struct pid *pid = NULL; - enum pid_type type; - long ret; - - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) - return -EINVAL; - if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) - return -EINVAL; - - switch (which) { - case P_ALL: - type = PIDTYPE_MAX; - break; - case P_PID: - type = PIDTYPE_PID; - if (upid <= 0) - return -EINVAL; - break; - case P_PGID: - type = PIDTYPE_PGID; - if (upid <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); - ret = do_wait(type, pid, options, infop, NULL, ru); - put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); - return ret; -} - -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) -{ - struct pid *pid = NULL; - enum pid_type type; - long ret; - - if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| - __WNOTHREAD|__WCLONE|__WALL)) - return -EINVAL; - - if (upid == -1) - type = PIDTYPE_MAX; - else if (upid < 0) { - type = PIDTYPE_PGID; - pid = find_get_pid(-upid); - } else if (upid == 0) { - type = PIDTYPE_PGID; - pid = get_pid(task_pgrp(current)); - } else /* upid > 0 */ { - type = PIDTYPE_PID; - pid = find_get_pid(upid); - } - - ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); - put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); - return ret; -} - -#ifdef 
__ARCH_WANT_SYS_WAITPID - -/* - * sys_waitpid() remains for compatibility. waitpid() should be - * implemented by calling sys_wait4() from libc.a. - */ -SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) -{ - return sys_wait4(pid, stat_addr, options, NULL); -} - -#endif -/* Rewritten by Rusty Russell, on the backs of many others... - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include - -extern struct exception_table_entry __start___ex_table[]; -extern struct exception_table_entry __stop___ex_table[]; - -/* Sort the kernel's built-in exception table */ -void __init sort_main_extable(void) -{ - sort_extable(__start___ex_table, __stop___ex_table); -} - -/* Given an address, look for it in the exception tables. 
*/ -const struct exception_table_entry *search_exception_tables(unsigned long addr) -{ - const struct exception_table_entry *e; - - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); - if (!e) - e = search_module_extables(addr); - return e; -} - -int core_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && - addr <= (unsigned long)_etext) - return 1; - - if (system_state == SYSTEM_BOOTING && - addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -int __kernel_text_address(unsigned long addr) -{ - if (core_kernel_text(addr)) - return 1; - return __module_text_address(addr) != NULL; -} - -int kernel_text_address(unsigned long addr) -{ - if (core_kernel_text(addr)) - return 1; - return module_text_address(addr) != NULL; -} -/* - * linux/kernel/fork.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * 'fork.c' contains the help-routines for the 'fork' system call - * (see also entry.S and others). - * Fork is rather simple, once you get the hang of it, but the memory - * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -/* - * Protected counters by write_lock_irq(&tasklist_lock) - */ -unsigned long total_forks; /* Handle normal Linux uptimes. */ -int nr_threads; /* The idle threads do not count.. 
*/ - -int max_threads; /* tunable limit on nr_threads */ - -DEFINE_PER_CPU(unsigned long, process_counts) = 0; - -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -int nr_processes(void) -{ - int cpu; - int total = 0; - - for_each_online_cpu(cpu) - total += per_cpu(process_counts, cpu); - - return total; -} - -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) -static struct kmem_cache *task_struct_cachep; -#endif - -#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) -{ -#ifdef CONFIG_DEBUG_STACK_USAGE - gfp_t mask = GFP_KERNEL | __GFP_ZERO; -#else - gfp_t mask = GFP_KERNEL; -#endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); -} - -static inline void free_thread_info(struct thread_info *ti) -{ - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); -} -#endif - -/* SLAB cache for signal_struct structures (tsk->signal) */ -static struct kmem_cache *signal_cachep; - -/* SLAB cache for sighand_struct structures (tsk->sighand) */ -struct kmem_cache *sighand_cachep; - -/* SLAB cache for files_struct structures (tsk->files) */ -struct kmem_cache *files_cachep; - -/* SLAB cache for fs_struct structures (tsk->fs) */ -struct kmem_cache *fs_cachep; - -/* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; - -/* SLAB cache for mm_struct structures (tsk->mm) */ -static struct kmem_cache *mm_cachep; - -void free_task(struct task_struct *tsk) -{ - prop_local_destroy_single(&tsk->dirties); - free_thread_info(tsk->stack); - rt_mutex_debug_task_free(tsk); - free_task_struct(tsk); -} -EXPORT_SYMBOL(free_task); - -void __put_task_struct(struct task_struct *tsk) -{ - WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); - WARN_ON(tsk == current); - - security_task_free(tsk); - free_uid(tsk->user); 
- put_group_info(tsk->group_info); - delayacct_tsk_free(tsk); - - if (!profile_handoff_task(tsk)) - free_task(tsk); -} - -/* - * macro override instead of weak attribute alias, to workaround - * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. - */ -#ifndef arch_task_cache_init -#define arch_task_cache_init() -#endif - -void __init fork_init(unsigned long mempages) -{ -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -#ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES -#endif - /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); -#endif - - /* do the arch specific task caches init */ - arch_task_cache_init(); - - /* - * The default maximum number of threads is set to a safe - * value: the thread structures can take up at most half - * of memory. - */ - max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); - - /* - * we need to allow at least 20 threads to boot a system - */ - if(max_threads < 20) - max_threads = 20; - - init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; - init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; - init_task.signal->rlim[RLIMIT_SIGPENDING] = - init_task.signal->rlim[RLIMIT_NPROC]; -} - -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, - struct task_struct *src) -{ - *dst = *src; - return 0; -} - -static struct task_struct *dup_task_struct(struct task_struct *orig) -{ - struct task_struct *tsk; - struct thread_info *ti; - int err; - - prepare_to_copy(orig); - - tsk = alloc_task_struct(); - if (!tsk) - return NULL; - - ti = alloc_thread_info(tsk); - if (!ti) { - free_task_struct(tsk); - return NULL; - } - - err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; - - tsk->stack = ti; - - err = prop_local_init_single(&tsk->dirties); - if (err) - goto out; - - setup_thread_stack(tsk, orig); - -#ifdef CONFIG_CC_STACKPROTECTOR - 
tsk->stack_canary = get_random_int(); -#endif - - /* One for us, one for whoever does the "release_task()" (usually parent) */ - atomic_set(&tsk->usage,2); - atomic_set(&tsk->fs_excl, 0); -#ifdef CONFIG_BLK_DEV_IO_TRACE - tsk->btrace_seq = 0; -#endif - tsk->splice_pipe = NULL; - return tsk; - -out: - free_thread_info(ti); - free_task_struct(tsk); - return NULL; -} - -#ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - struct mempolicy *pol; - - down_write(&oldmm->mmap_sem); - flush_cache_dup_mm(oldmm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - mm->locked_vm = 0; - mm->mmap = NULL; - mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; - mm->map_count = 0; - cpus_clear(mm->cpu_vm_mask); - mm->mm_rb = RB_ROOT; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - long pages = vma_pages(mpnt); - mm->total_vm -= pages; - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -pages); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) - goto fail_nomem_policy; - vma_set_policy(tmp, pol); - tmp->vm_flags &= ~VM_LOCKED; - tmp->vm_mm = mm; - tmp->vm_next = NULL; - anon_vma_link(tmp); - file = tmp->vm_file; - if (file) { - struct inode *inode = file->f_path.dentry->d_inode; - struct address_space *mapping = file->f_mapping; 
- - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); - if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; - tmp->vm_truncate_count = mpnt->vm_truncate_count; - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); - flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - return retval; -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct * mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct * mm) -{ - pgd_free(mm, mm->pgd); -} -#else -#define dup_mmap(mm, oldmm) (0) -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - -__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); - -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - -#include - -static struct mm_struct * 
mm_init(struct mm_struct * mm, struct task_struct *p) -{ - atomic_set(&mm->mm_users, 1); - atomic_set(&mm->mm_count, 1); - init_rwsem(&mm->mmap_sem); - INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags - : MMF_DUMP_FILTER_DEFAULT; - mm->core_state = NULL; - mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); - spin_lock_init(&mm->page_table_lock); - rwlock_init(&mm->ioctx_list_lock); - mm->ioctx_list = NULL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - mm_init_owner(mm, p); - - if (likely(!mm_alloc_pgd(mm))) { - mm->def_flags = 0; - mmu_notifier_mm_init(mm); - return mm; - } - - free_mm(mm); - return NULL; -} - -/* - * Allocate and initialize an mm_struct. - */ -struct mm_struct * mm_alloc(void) -{ - struct mm_struct * mm; - - mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); - } - return mm; -} - -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - mmu_notifier_mm_destroy(mm); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - -/* - * Decrement the use count and release all resources for an mm. - */ -void mmput(struct mm_struct *mm) -{ - might_sleep(); - - if (atomic_dec_and_test(&mm->mm_users)) { - exit_aio(mm); - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - put_swap_token(mm); - mmdrop(mm); - } -} -EXPORT_SYMBOL_GPL(mmput); - -/** - * get_task_mm - acquire a reference to the task's mm - * - * Returns %NULL if the task has no mm. 
Checks PF_KTHREAD (meaning - * this kernel workthread has transiently adopted a user mm with use_mm, - * to do its AIO) is not set and if so returns a reference to it, after - * bumping up the use count. User must release the mm via mmput() - * after use. Typically used by /proc and ptrace. - */ -struct mm_struct *get_task_mm(struct task_struct *task) -{ - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) { - if (task->flags & PF_KTHREAD) - mm = NULL; - else - atomic_inc(&mm->mm_users); - } - task_unlock(task); - return mm; -} -EXPORT_SYMBOL_GPL(get_task_mm); - -/* Please note the differences between mmput and mm_release. - * mmput is called whenever we stop holding onto a mm_struct, - * error success whatever. - * - * mm_release is called after a mm_struct has been removed - * from the current process. - * - * This difference is important for error handling, when we - * only half set up a mm_struct for a new process and need to restore - * the old one. Because we mmput the new mm_struct before - * restoring the old one. . . - * Eric Biederman 10 January 1998 - */ -void mm_release(struct task_struct *tsk, struct mm_struct *mm) -{ - struct completion *vfork_done = tsk->vfork_done; - - /* Get rid of any cached register state */ - deactivate_mm(tsk, mm); - - /* notify parent sleeping on vfork() */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } - - /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. - */ - if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && - atomic_read(&mm->mm_users) > 1) { - /* - * We don't check the error code - if userspace has - * not set up a proper pointer then tough luck. 
- */ - put_user(0, tsk->clear_child_tid); - sys_futex(tsk->clear_child_tid, FUTEX_WAKE, - 1, NULL, NULL, 0); - } - tsk->clear_child_tid = NULL; - } -} - -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. - */ -struct mm_struct *dup_mm(struct task_struct *tsk) -{ - struct mm_struct *mm, *oldmm = current->mm; - int err; - - if (!oldmm) - return NULL; - - mm = allocate_mm(); - if (!mm) - goto fail_nomem; - - memcpy(mm, oldmm, sizeof(*mm)); - - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - - if (!mm_init(mm, tsk)) - goto fail_nomem; - - if (init_new_context(tsk, mm)) - goto fail_nocontext; - - dup_mm_exe_file(oldmm, mm); - - err = dup_mmap(mm, oldmm); - if (err) - goto free_pt; - - mm->hiwater_rss = get_mm_rss(mm); - mm->hiwater_vm = mm->total_vm; - - return mm; - -free_pt: - mmput(mm); - -fail_nomem: - return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; -} - -static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) -{ - struct mm_struct * mm, *oldmm; - int retval; - - tsk->min_flt = tsk->maj_flt = 0; - tsk->nvcsw = tsk->nivcsw = 0; - - tsk->mm = NULL; - tsk->active_mm = NULL; - - /* - * Are we cloning a kernel thread? - * - * We need to steal a active VM for that.. 
- */ - oldmm = current->mm; - if (!oldmm) - return 0; - - if (clone_flags & CLONE_VM) { - atomic_inc(&oldmm->mm_users); - mm = oldmm; - goto good_mm; - } - - retval = -ENOMEM; - mm = dup_mm(tsk); - if (!mm) - goto fail_nomem; - -good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - - tsk->mm = mm; - tsk->active_mm = mm; - return 0; - -fail_nomem: - return retval; -} - -static struct fs_struct *__copy_fs_struct(struct fs_struct *old) -{ - struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); - /* We don't need to lock fs - think why ;-) */ - if (fs) { - atomic_set(&fs->count, 1); - rwlock_init(&fs->lock); - fs->umask = old->umask; - read_lock(&old->lock); - fs->root = old->root; - path_get(&old->root); - fs->pwd = old->pwd; - path_get(&old->pwd); - read_unlock(&old->lock); - } - return fs; -} - -struct fs_struct *copy_fs_struct(struct fs_struct *old) -{ - return __copy_fs_struct(old); -} - -EXPORT_SYMBOL_GPL(copy_fs_struct); - -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) -{ - if (clone_flags & CLONE_FS) { - atomic_inc(¤t->fs->count); - return 0; - } - tsk->fs = __copy_fs_struct(current->fs); - if (!tsk->fs) - return -ENOMEM; - return 0; -} - -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) -{ - struct files_struct *oldf, *newf; - int error = 0; - - /* - * A background process may not have any files ... 
- */ - oldf = current->files; - if (!oldf) - goto out; - - if (clone_flags & CLONE_FILES) { - atomic_inc(&oldf->count); - goto out; - } - - newf = dup_fd(oldf, &error); - if (!newf) - goto out; - - tsk->files = newf; - error = 0; -out: - return error; -} - -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; - } else if (ioprio_valid(ioc->ioprio)) { - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); - if (unlikely(!tsk->io_context)) - return -ENOMEM; - - tsk->io_context->ioprio = ioc->ioprio; - } -#endif - return 0; -} - -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) -{ - struct sighand_struct *sig; - - if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { - atomic_inc(¤t->sighand->count); - return 0; - } - sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); - if (!sig) - return -ENOMEM; - atomic_set(&sig->count, 1); - memcpy(sig->action, current->sighand->action, sizeof(sig->action)); - return 0; -} - -void __cleanup_sighand(struct sighand_struct *sighand) -{ - if (atomic_dec_and_test(&sighand->count)) - kmem_cache_free(sighand_cachep, sighand); -} - -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) -{ - struct signal_struct *sig; - int ret; - - if (clone_flags & CLONE_THREAD) - return 0; - - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); - tsk->signal = sig; - if (!sig) - return -ENOMEM; - - ret = copy_thread_group_keys(tsk); - if (ret < 0) { - kmem_cache_free(signal_cachep, sig); - return ret; - } - - atomic_set(&sig->count, 1); - atomic_set(&sig->live, 1); - init_waitqueue_head(&sig->wait_chldexit); - sig->flags = 0; - sig->group_exit_code = 0; - 
sig->group_exit_task = NULL; - sig->group_stop_count = 0; - sig->curr_target = tsk; - init_sigpending(&sig->shared_pending); - INIT_LIST_HEAD(&sig->posix_timers); - - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->it_real_incr.tv64 = 0; - sig->real_timer.function = it_real_fn; - - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - - sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = NULL; - - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; - sig->gtime = cputime_zero; - sig->cgtime = cputime_zero; - sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; - sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; - sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); - taskstats_tgid_init(sig); - - task_lock(current->group_leader); - memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); - task_unlock(current->group_leader); - - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. 
- */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } - acct_init_pacct(&sig->pacct); - - tty_audit_fork(sig); - - return 0; -} - -void __cleanup_signal(struct signal_struct *sig) -{ - exit_thread_group_keys(sig); - kmem_cache_free(signal_cachep, sig); -} - -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~PF_SUPERPRIV; - new_flags |= PF_FORKNOEXEC; - new_flags |= PF_STARTING; - p->flags = new_flags; - clear_freeze_flag(p); -} - -SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) -{ - current->clear_child_tid = tidptr; - - return task_pid_vnr(current); -} - -static void rt_mutex_init_task(struct task_struct *p) -{ - spin_lock_init(&p->pi_lock); -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters, &p->pi_lock); - p->pi_blocked_on = NULL; -#endif -} - -#ifdef CONFIG_MM_OWNER -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MM_OWNER */ - -/* - * This creates a new process as a copy of the old one, - * but does not actually start it yet. - * - * It copies the registers, and all the appropriate - * parts of the process environment (as per the clone - * flags). The actual kick-off is left to the caller. - */ -static struct task_struct *copy_process(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *child_tidptr, - struct pid *pid, - int trace) -{ - int retval; - struct task_struct *p; - int cgroup_callbacks_done = 0; - - if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) - return ERR_PTR(-EINVAL); - - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. - */ - if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) - return ERR_PTR(-EINVAL); - - /* - * Shared signal handlers imply shared VM. 
By way of the above, - * thread groups also imply shared VM. Blocking this case allows - * for various simplifications in other code. - */ - if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) - return ERR_PTR(-EINVAL); - - retval = security_task_create(clone_flags); - if (retval) - goto fork_out; - - retval = -ENOMEM; - p = dup_task_struct(current); - if (!p) - goto fork_out; - - rt_mutex_init_task(p); - -#ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); - DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); -#endif - retval = -EAGAIN; - if (atomic_read(&p->user->processes) >= - p->signal->rlim[RLIMIT_NPROC].rlim_cur) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->user != current->nsproxy->user_ns->root_user) - goto bad_fork_free; - } - - atomic_inc(&p->user->__count); - atomic_inc(&p->user->processes); - get_group_info(p->group_info); - - /* - * If multiple threads are within copy_process(), then this check - * triggers too late. This doesn't hurt, the check is only there - * to stop root fork bombs. 
- */ - if (nr_threads >= max_threads) - goto bad_fork_cleanup_count; - - if (!try_module_get(task_thread_info(p)->exec_domain->module)) - goto bad_fork_cleanup_count; - - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - - p->did_exec = 0; - delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); - INIT_LIST_HEAD(&p->children); - INIT_LIST_HEAD(&p->sibling); -#ifdef CONFIG_PREEMPT_RCU - p->rcu_read_lock_nesting = 0; - p->rcu_flipctr_idx = 0; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - p->vfork_done = NULL; - spin_lock_init(&p->alloc_lock); - - clear_tsk_thread_flag(p, TIF_SIGPENDING); - init_sigpending(&p->pending); - - p->utime = cputime_zero; - p->stime = cputime_zero; - p->gtime = cputime_zero; - p->utimescaled = cputime_zero; - p->stimescaled = cputime_zero; - p->prev_utime = cputime_zero; - p->prev_stime = cputime_zero; - -#ifdef CONFIG_DETECT_SOFTLOCKUP - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - - task_io_accounting_init(&p->ioac); - acct_clear_integrals(p); - - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); - - p->lock_depth = -1; /* -1 = no lock */ - do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - monotonic_to_bootbased(&p->real_start_time); -#ifdef CONFIG_SECURITY - p->security = NULL; -#endif - p->cap_bset = current->cap_bset; - p->io_context = NULL; - p->audit_context = NULL; - cgroup_fork(p); -#ifdef CONFIG_NUMA - p->mempolicy = mpol_dup(p->mempolicy); - if (IS_ERR(p->mempolicy)) { - retval = PTR_ERR(p->mempolicy); - p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; - } - mpol_fix_fork_child_flag(p); -#endif -#ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - p->hardirqs_enabled = 1; -#else - 
p->hardirqs_enabled = 0; -#endif - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->hardirq_context = 0; - p->softirq_context = 0; -#endif -#ifdef CONFIG_LOCKDEP - p->lockdep_depth = 0; /* no locks held yet */ - p->curr_chain_key = 0; - p->lockdep_recursion = 0; -#endif - -#ifdef CONFIG_DEBUG_MUTEXES - p->blocked_on = NULL; /* not blocked yet */ -#endif - - /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); - - if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup_policy; - if ((retval = audit_alloc(p))) - goto bad_fork_cleanup_security; - /* copy all the process information */ - if ((retval = copy_semundo(clone_flags, p))) - goto bad_fork_cleanup_audit; - if ((retval = copy_files(clone_flags, p))) - goto bad_fork_cleanup_semundo; - if ((retval = copy_fs(clone_flags, p))) - goto bad_fork_cleanup_files; - if ((retval = copy_sighand(clone_flags, p))) - goto bad_fork_cleanup_fs; - if ((retval = copy_signal(clone_flags, p))) - goto bad_fork_cleanup_sighand; - if ((retval = copy_mm(clone_flags, p))) - goto bad_fork_cleanup_signal; - if ((retval = copy_keys(clone_flags, p))) - goto bad_fork_cleanup_mm; - if ((retval = copy_namespaces(clone_flags, p))) - goto bad_fork_cleanup_keys; - if ((retval = copy_io(clone_flags, p))) - goto bad_fork_cleanup_namespaces; - retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); - if (retval) - goto bad_fork_cleanup_io; - - if (pid != &init_struct_pid) { - retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); - if (!pid) - goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(task_active_pid_ns(p)); - if (retval < 0) - goto bad_fork_free_pid; - } - } - - p->pid = pid_nr(pid); - p->tgid = 
p->pid; - if (clone_flags & CLONE_THREAD) - p->tgid = current->tgid; - - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; -#ifdef CONFIG_FUTEX - p->robust_list = NULL; -#ifdef CONFIG_COMPAT - p->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&p->pi_state_list); - p->pi_state_cache = NULL; -#endif - /* - * sigaltstack should be cleared when sharing the same VM - */ - if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; - - /* - * Syscall tracing should be turned off in the child regardless - * of CLONE_PTRACE. - */ - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); -#endif - clear_all_latency_tracing(p); - - /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); - p->pdeath_signal = 0; - p->exit_state = 0; - - /* - * Ok, make it visible to the rest of the system. - * We dont wake it up yet. - */ - p->group_leader = p; - INIT_LIST_HEAD(&p->thread_group); - - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - - /* Need tasklist lock for parent etc handling! */ - write_lock_irq(&tasklist_lock); - - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. 
- */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - - /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { - p->real_parent = current->real_parent; - p->parent_exec_id = current->parent_exec_id; - } else { - p->real_parent = current; - p->parent_exec_id = current->self_exec_id; - } - - spin_lock(¤t->sighand->siglock); - - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). - */ - recalc_sigpending(); - if (signal_pending(current)) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; - } - - if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - p->group_leader = current->group_leader; - list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(¤t->signal->cpu_timers[0]) || - !list_empty(¤t->signal->cpu_timers[1]) || - !list_empty(¤t->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. 
- */ - p->it_prof_expires = jiffies_to_cputime(1); - } - } - - if (likely(p->pid)) { - list_add_tail(&p->sibling, &p->real_parent->children); - tracehook_finish_clone(p, clone_flags, trace); - - if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) - p->nsproxy->pid_ns->child_reaper = p; - - p->signal->leader_pid = pid; - p->signal->tty = current->signal->tty; - set_task_pgrp(p, task_pgrp_nr(current)); - set_task_session(p, task_session_nr(current)); - attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); - attach_pid(p, PIDTYPE_SID, task_session(current)); - list_add_tail_rcu(&p->tasks, &init_task.tasks); - __get_cpu_var(process_counts)++; - } - attach_pid(p, PIDTYPE_PID, pid); - nr_threads++; - } - - total_forks++; - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - proc_fork_connector(p); - cgroup_post_fork(p); - return p; - -bad_fork_free_pid: - if (pid != &init_struct_pid) - free_pid(pid); -bad_fork_cleanup_io: - put_io_context(p->io_context); -bad_fork_cleanup_namespaces: - exit_task_namespaces(p); -bad_fork_cleanup_keys: - exit_keys(p); -bad_fork_cleanup_mm: - if (p->mm) - mmput(p->mm); -bad_fork_cleanup_signal: - if (!(clone_flags & CLONE_THREAD)) - __cleanup_signal(p->signal); -bad_fork_cleanup_sighand: - __cleanup_sighand(p->sighand); -bad_fork_cleanup_fs: - exit_fs(p); /* blocking */ -bad_fork_cleanup_files: - exit_files(p); /* blocking */ -bad_fork_cleanup_semundo: - exit_sem(p); -bad_fork_cleanup_audit: - audit_free(p); -bad_fork_cleanup_security: - security_task_free(p); -bad_fork_cleanup_policy: -#ifdef CONFIG_NUMA - mpol_put(p->mempolicy); -bad_fork_cleanup_cgroup: -#endif - cgroup_exit(p, cgroup_callbacks_done); - delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: - module_put(task_thread_info(p)->exec_domain->module); -bad_fork_cleanup_count: - put_group_info(p->group_info); - atomic_dec(&p->user->processes); - free_uid(p->user); -bad_fork_free: - free_task(p); 
-fork_out: - return ERR_PTR(retval); -} - -noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - return regs; -} - -struct task_struct * __cpuinit fork_idle(int cpu) -{ - struct task_struct *task; - struct pt_regs regs; - - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid, 0); - if (!IS_ERR(task)) - init_idle(task, cpu); - - return task; -} - -/* - * Ok, this is the main fork-routine. - * - * It copies the process, and if successful kick-starts - * it and waits for it to finish using the VM if required. - */ -long do_fork(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) -{ - struct task_struct *p; - int trace = 0; - long nr; - - /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - - /* - * When called from kernel_thread, don't do user tracing stuff. - */ - if (likely(user_mode(regs))) - trace = tracehook_prepare_clone(clone_flags); - - p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL, trace); - /* - * Do this prior waking up the new thread - the thread pointer - * might get invalid after that point, if the thread exits quickly. 
- */ - if (!IS_ERR(p)) { - struct completion vfork; - - nr = task_pid_vnr(p); - - if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); - - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); - } - - tracehook_report_clone(trace, regs, clone_flags, nr, p); - - /* - * We set PF_STARTING at creation in case tracing wants to - * use this to distinguish a fully live task from one that - * hasn't gotten to tracehook_report_clone() yet. Now we - * clear it and set the child going. - */ - p->flags &= ~PF_STARTING; - - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } - - tracehook_report_clone_complete(trace, regs, - clone_flags, nr, p); - - if (clone_flags & CLONE_VFORK) { - freezer_do_not_count(); - wait_for_completion(&vfork); - freezer_count(); - tracehook_report_vfork_done(p, nr); - } - } else { - nr = PTR_ERR(p); - } - return nr; -} - -#ifndef ARCH_MIN_MMSTRUCT_ALIGN -#define ARCH_MIN_MMSTRUCT_ALIGN 0 -#endif - -static void sighand_ctor(void *data) -{ - struct sighand_struct *sighand = data; - - spin_lock_init(&sighand->siglock); - init_waitqueue_head(&sighand->signalfd_wqh); -} - -void __init proc_caches_init(void) -{ - sighand_cachep = kmem_cache_create("sighand_cache", - sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor); - signal_cachep = kmem_cache_create("signal_cache", - sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - files_cachep = kmem_cache_create("files_cache", - sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - fs_cachep = kmem_cache_create("fs_cache", - sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - 
sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); - mm_cachep = kmem_cache_create("mm_struct", - sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} - -/* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. - */ -static void check_unshare_flags(unsigned long *flags_ptr) -{ - /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing signal handlers and the task was created - * using CLONE_THREAD, then must unshare the thread - */ - if ((*flags_ptr & CLONE_SIGHAND) && - (atomic_read(¤t->signal->count) > 1)) - *flags_ptr |= CLONE_THREAD; - - /* - * If unsharing namespace, must also unshare filesystem information. - */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; - - return 0; -} - -/* - * Unshare the filesystem structure if it is being shared - */ -static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) -{ - struct fs_struct *fs = current->fs; - - if ((unshare_flags & CLONE_FS) && - (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = __copy_fs_struct(current->fs); - if (!*new_fsp) - return -ENOMEM; - } - - return 0; -} - -/* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ 
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - -/* - * Unshare file descriptor table if it is being shared - */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) -{ - struct files_struct *fd = current->files; - int error = 0; - - if ((unshare_flags & CLONE_FILES) && - (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); - if (!*new_fdp) - return error; - } - - return 0; -} - -/* - * unshare allows a process to 'unshare' part of the process - * context which was originally shared using clone. copy_* - * functions used by do_fork() cannot be used here directly - * because they modify an inactive task_struct that is being - * constructed. Here we are modifying the current, active, - * task_struct. - */ -SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) -{ - int err = 0; - struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; - struct files_struct *fd, *new_fd = NULL; - struct nsproxy *new_nsproxy = NULL; - int do_sysvsem = 0; - - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| - CLONE_NEWNET)) - goto bad_unshare_out; - - /* - * CLONE_NEWIPC must also detach from the undolist: after switching - * to a new ipc namespace, the semaphore arrays from the old - * namespace are unreachable. 
- */ - if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) - do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; - if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; - if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; - if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, - new_fs))) - goto bad_unshare_cleanup_fd; - - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { - if (do_sysvsem) { - /* - * CLONE_SYSVSEM is equivalent to sys_exit(). - */ - exit_sem(current); - } - - if (new_nsproxy) { - switch_task_namespaces(current, new_nsproxy); - new_nsproxy = NULL; - } - - task_lock(current); - - if (new_fs) { - fs = current->fs; - current->fs = new_fs; - new_fs = fs; - } - - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - activate_mm(active_mm, new_mm); - new_mm = mm; - } - - if (new_fd) { - fd = current->files; - current->files = new_fd; - new_fd = fd; - } - - task_unlock(current); - } - - if (new_nsproxy) - put_nsproxy(new_nsproxy); - -bad_unshare_cleanup_fd: - if (new_fd) - put_files_struct(new_fd); - -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - -bad_unshare_cleanup_fs: - if (new_fs) - put_fs_struct(new_fs); - -bad_unshare_cleanup_thread: -bad_unshare_out: - return err; -} - -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. 
- */ - -int unshare_files(struct files_struct **displaced) -{ - struct task_struct *task = current; - struct files_struct *copy = NULL; - int error; - - error = unshare_fd(CLONE_FILES, ©); - if (error || !copy) { - *displaced = NULL; - return error; - } - *displaced = task->files; - task_lock(task); - task->files = copy; - task_unlock(task); - return 0; -} -/* - * Fast Userspace Mutexes (which I call "Futexes!"). - * (C) Rusty Russell, IBM 2002 - * - * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar - * (C) Copyright 2003 Red Hat Inc, All Rights Reserved - * - * Removed page pinning, fix privately mapped COW pages and other cleanups - * (C) Copyright 2003, 2004 Jamie Lokier - * - * Robust futex support started by Ingo Molnar - * (C) Copyright 2006 Red Hat Inc, All Rights Reserved - * Thanks to Thomas Gleixner for suggestions, analysis and fixes. - * - * PI-futex support started by Ingo Molnar and Thomas Gleixner - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner - * - * PRIVATE futexes by Eric Dumazet - * Copyright (C) 2007 Eric Dumazet - * - * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly - * enough at me, Linus for the original (flawed) idea, Matthew - * Kirkwood for proof-of-concept implementation. - * - * "The futexes are also cursed." - * "But they come in a choice of three flavours!" - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "rtmutex_common.h" - -int __read_mostly futex_cmpxchg_enabled; - -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) - -/* - * Priority Inheritance state: - */ -struct futex_pi_state { - /* - * list of 'owned' pi_state instances - these have to be - * cleaned up in do_exit() if the task exits prematurely: - */ - struct list_head list; - - /* - * The PI object: - */ - struct rt_mutex pi_mutex; - - struct task_struct *owner; - atomic_t refcount; - - union futex_key key; -}; - -/* - * We use this hashed waitqueue instead of a normal wait_queue_t, so - * we can wake only the relevant ones (hashed queues may be shared). - * - * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakup is always to make the first condition true, then - * wake up q->waiters, then make the second condition true. - */ -struct futex_q { - struct plist_node list; - wait_queue_head_t waiters; - - /* Which hash list lock to use: */ - spinlock_t *lock_ptr; - - /* Key which the futex is hashed on: */ - union futex_key key; - - /* Optional priority inheritance state: */ - struct futex_pi_state *pi_state; - struct task_struct *task; - - /* Bitset for the optional bitmasked wakeup */ - u32 bitset; -}; - -/* - * Split the global futex_lock into every hash list lock. 
- */ -struct futex_hash_bucket { - spinlock_t lock; - struct plist_head chain; -}; - -static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared - */ -static inline void futex_lock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - down_read(fshared); -} - -/* - * Release mm->mmap_sem, when the futex is shared - */ -static inline void futex_unlock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - up_read(fshared); -} - -/* - * We hash on the keys returned from get_futex_key (see below). - */ -static struct futex_hash_bucket *hash_futex(union futex_key *key) -{ - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, - key->both.offset); - return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; -} - -/* - * Return 1 if two futex_keys are equal, 0 otherwise. - */ -static inline int match_futex(union futex_key *key1, union futex_key *key2) -{ - return (key1->both.word == key2->both.word - && key1->both.ptr == key2->both.ptr - && key1->both.offset == key2->both.offset); -} - -/** - * get_futex_key - Get parameters which are the keys for a futex. - * @uaddr: virtual address of the futex - * @shared: NULL for a PROCESS_PRIVATE futex, - * ¤t->mm->mmap_sem for a PROCESS_SHARED futex - * @key: address where result is stored. - * - * Returns a negative error code or 0 - * The key words are stored in *key on success. - * - * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. - * - * fshared is NULL for PROCESS_PRIVATE futexes - * For other futexes, it points to ¤t->mm->mmap_sem and - * caller must have taken the reader lock. but NOT any spinlocks. 
- */ -static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, - union futex_key *key) -{ - unsigned long address = (unsigned long)uaddr; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct page *page; - int err; - - /* - * The futex address must be "naturally" aligned. - */ - key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) - return -EINVAL; - address -= key->both.offset; - - /* - * PROCESS_PRIVATE futexes are fast. - * As the mm cannot disappear under us and the 'key' only needs - * virtual address, we dont even have to find the underlying vma. - * Note : We do have to check 'uaddr' is a valid user address, - * but access_ok() should be faster than find_vma() - */ - if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; - key->private.mm = mm; - key->private.address = address; - return 0; - } - /* - * The futex is hashed differently depending on whether - * it's in a shared or private mapping. So check vma first. - */ - vma = find_extend_vma(mm, address); - if (unlikely(!vma)) - return -EFAULT; - - /* - * Permissions. - */ - if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) - return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; - - /* - * Private mappings are handled in a simple way. - * - * NOTE: When userspace waits on a MAP_SHARED mapping, even if - * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. Therefore we use - * VM_MAYSHARE here, not VM_SHARED which is restricted to shared - * mappings of _writable_ handles. - */ - if (likely(!(vma->vm_flags & VM_MAYSHARE))) { - key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ - key->private.mm = mm; - key->private.address = address; - return 0; - } - - /* - * Linear file mappings are also simple. - */ - key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset |= FUT_OFF_INODE; /* inode-based key. 
*/ - if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - + vma->vm_pgoff); - return 0; - } - - /* - * We could walk the page table to read the non-linear - * pte, and get the page index without fetching the page - * from swap. But that's a lot of code to duplicate here - * for a rare case, so we simply fetch the page. - */ - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - if (err >= 0) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - put_page(page); - return 0; - } - return err; -} - -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (key->both.ptr == NULL) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); - break; - case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); - break; - } -} - -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. - */ -static void drop_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } -} - -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) -{ - u32 curval; - - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - pagefault_enable(); - - return curval; -} - -static int get_futex_value_locked(u32 *dest, u32 __user *from) -{ - int ret; - - pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); - pagefault_enable(); - - return ret ? -EFAULT : 0; -} - -/* - * Fault handling. 
- * if fshared is non NULL, current->mm->mmap_sem is already held - */ -static int futex_handle_fault(unsigned long address, - struct rw_semaphore *fshared, int attempt) -{ - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - int ret = -EFAULT; - - if (attempt > 2) - return ret; - - if (!fshared) - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); - if (vma && address >= vma->vm_start && - (vma->vm_flags & VM_WRITE)) { - int fault; - fault = handle_mm_fault(mm, vma, address, 1); - if (unlikely((fault & VM_FAULT_ERROR))) { -#if 0 - /* XXX: let's do this when we verify it is OK */ - if (ret & VM_FAULT_OOM) - ret = -ENOMEM; -#endif - } else { - ret = 0; - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - } - } - if (!fshared) - up_read(&mm->mmap_sem); - return ret; -} - -/* - * PI code: - */ -static int refill_pi_state_cache(void) -{ - struct futex_pi_state *pi_state; - - if (likely(current->pi_state_cache)) - return 0; - - pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); - - if (!pi_state) - return -ENOMEM; - - INIT_LIST_HEAD(&pi_state->list); - /* pi_mutex gets initialized later */ - pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); - - current->pi_state_cache = pi_state; - - return 0; -} - -static struct futex_pi_state * alloc_pi_state(void) -{ - struct futex_pi_state *pi_state = current->pi_state_cache; - - WARN_ON(!pi_state); - current->pi_state_cache = NULL; - - return pi_state; -} - -static void free_pi_state(struct futex_pi_state *pi_state) -{ - if (!atomic_dec_and_test(&pi_state->refcount)) - return; - - /* - * If pi_state->owner is NULL, the owner is most probably dying - * and has cleaned up the pi_state already - */ - if (pi_state->owner) { - spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); - - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); - } - - if (current->pi_state_cache) - kfree(pi_state); - else 
{ - /* - * pi_state->list is already empty. - * clear pi_state->owner. - * refcount is at 0 - put it back to 1. - */ - pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); - current->pi_state_cache = pi_state; - } -} - -/* - * Look up the task based on what TID userspace gave us. - * We dont trust it. - */ -static struct task_struct * futex_find_get_task(pid_t pid) -{ - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - - rcu_read_unlock(); - - return p; -} - -/* - * This task is holding PI mutexes at exit time => bad. - * Kernel cleans up PI-state, but userspace is likely hosed. - * (Robust-futex cleanup is separate and might save the day for userspace.) - */ -void exit_pi_state_list(struct task_struct *curr) -{ - struct list_head *next, *head = &curr->pi_state_list; - struct futex_pi_state *pi_state; - struct futex_hash_bucket *hb; - union futex_key key; - - if (!futex_cmpxchg_enabled) - return; - /* - * We are a ZOMBIE and nobody can enqueue itself on - * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselves: - */ - spin_lock_irq(&curr->pi_lock); - while (!list_empty(head)) { - - next = head->next; - pi_state = list_entry(next, struct futex_pi_state, list); - key = pi_state->key; - hb = hash_futex(&key); - spin_unlock_irq(&curr->pi_lock); - - spin_lock(&hb->lock); - - spin_lock_irq(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - spin_unlock(&hb->lock); - continue; - } - - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - spin_unlock_irq(&curr->pi_lock); - - rt_mutex_unlock(&pi_state->pi_mutex); - - spin_unlock(&hb->lock); - - spin_lock_irq(&curr->pi_lock); - } - spin_unlock_irq(&curr->pi_lock); 
-} - -static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) -{ - struct futex_pi_state *pi_state = NULL; - struct futex_q *this, *next; - struct plist_head *head; - struct task_struct *p; - pid_t pid = uval & FUTEX_TID_MASK; - - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex(&this->key, key)) { - /* - * Another waiter already exists - bump up - * the refcount and return its pi_state: - */ - pi_state = this->pi_state; - /* - * Userspace might have messed up non PI and PI futexes - */ - if (unlikely(!pi_state)) - return -EINVAL; - - WARN_ON(!atomic_read(&pi_state->refcount)); - - /* - * When pi_state->owner is NULL then the owner died - * and another waiter is on the fly. pi_state->owner - * is fixed up by the task which acquires - * pi_state->rt_mutex. - * - * We do not check for pid == 0 which can happen when - * the owner died and robust_list_exit() cleared the - * TID. - */ - if (pid && pi_state->owner) { - /* - * Bail out if user space manipulated the - * futex value. - */ - if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; - } - - atomic_inc(&pi_state->refcount); - *ps = pi_state; - - return 0; - } - } - - /* - * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 - */ - if (!pid) - return -ESRCH; - p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); - - /* - * We need to look at the task state flags to figure out, - * whether the task is exiting. To protect against the do_exit - * change of the task flags, we do this protected by - * p->pi_lock: - */ - spin_lock_irq(&p->pi_lock); - if (unlikely(p->flags & PF_EXITING)) { - /* - * The task is on the way out. When PF_EXITPIDONE is - * set, we know that the task has finished the - * cleanup: - */ - int ret = (p->flags & PF_EXITPIDONE) ? 
-ESRCH : -EAGAIN; - - spin_unlock_irq(&p->pi_lock); - put_task_struct(p); - return ret; - } - - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make 'p' - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - pi_state->owner = p; - spin_unlock_irq(&p->pi_lock); - - put_task_struct(p); - - *ps = pi_state; - - return 0; -} - -/* - * The hash bucket lock must be held when this is called. - * Afterwards, the futex_q must not be accessed. - */ -static void wake_futex(struct futex_q *q) -{ - plist_del(&q->list, &q->list.plist); - /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. - */ - wake_up_all(&q->waiters); - /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store - * to lock_ptr from getting ahead of the wakeup. Clearing the lock - * at the end of wake_up_all() does not prevent this store from - * moving. - */ - smp_wmb(); - q->lock_ptr = NULL; -} - -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) -{ - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; - u32 curval, newval; - - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - - spin_lock(&pi_state->pi_mutex.wait_lock); - new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); - - /* - * This happens when we have stolen the lock and the original - * pending owner did not enqueue itself back on the rt_mutex. - * Thats not a tragedy. 
We know that way, that a lock waiter - * is on the fly. We make the futex_q waiter the pending owner. - */ - if (!new_owner) - new_owner = this->task; - - /* - * We pass it to the next owner. (The WAITERS bit is always - * kept enabled while there is PI state around. We must also - * preserve the owner died bit.) - */ - if (!(uval & FUTEX_OWNER_DIED)) { - int ret = 0; - - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) - ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; - if (ret) { - spin_unlock(&pi_state->pi_mutex.wait_lock); - return ret; - } - } - - spin_lock_irq(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); - - spin_lock_irq(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - spin_unlock_irq(&new_owner->pi_lock); - - spin_unlock(&pi_state->pi_mutex.wait_lock); - rt_mutex_unlock(&pi_state->pi_mutex); - - return 0; -} - -static int unlock_futex_pi(u32 __user *uaddr, u32 uval) -{ - u32 oldval; - - /* - * There is no waiter, so we unlock the futex. The owner died - * bit has not to be preserved here. 
We are the owner: - */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; - if (oldval != uval) - return -EAGAIN; - - return 0; -} - -/* - * Express the locking dependencies for lockdep: - */ -static inline void -double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - if (hb1 <= hb2) { - spin_lock(&hb1->lock); - if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); - } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); - } -} - -/* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: - */ -static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake, u32 bitset) -{ - struct futex_hash_bucket *hb; - struct futex_q *this, *next; - struct plist_head *head; - union futex_key key; - int ret; - - if (!bitset) - return -EINVAL; - - futex_lock_mm(fshared); - - ret = get_futex_key(uaddr, fshared, &key); - if (unlikely(ret != 0)) - goto out; - - hb = hash_futex(&key); - spin_lock(&hb->lock); - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &key)) { - if (this->pi_state) { - ret = -EINVAL; - break; - } - - /* Check if one of the bits is set in both bitsets */ - if (!(this->bitset & bitset)) - continue; - - wake_futex(this); - if (++ret >= nr_wake) - break; - } - } - - spin_unlock(&hb->lock); -out: - futex_unlock_mm(fshared); - return ret; -} - -/* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: - */ -static int -futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, - int nr_wake, int nr_wake2, int op) -{ - union futex_key key1, key2; - struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head; - struct futex_q *this, *next; - int ret, op_ret, attempt = 0; - -retryfull: - futex_lock_mm(fshared); - - ret = get_futex_key(uaddr1, 
fshared, &key1); - if (unlikely(ret != 0)) - goto out; - ret = get_futex_key(uaddr2, fshared, &key2); - if (unlikely(ret != 0)) - goto out; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry: - double_lock_hb(hb1, hb2); - - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - u32 dummy; - - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); - -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out; -#endif - - if (unlikely(op_ret != -EFAULT)) { - ret = op_ret; - goto out; - } - - /* - * futex_atomic_op_inuser needs to both read and write - * *(int __user *)uaddr2, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - */ - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); - if (ret) - goto out; - goto retry; - } - - /* - * If we would have faulted, release mmap_sem, - * fault it in and start all over again. - */ - futex_unlock_mm(fshared); - - ret = get_user(dummy, uaddr2); - if (ret) - return ret; - - goto retryfull; - } - - head = &hb1->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &key1)) { - wake_futex(this); - if (++ret >= nr_wake) - break; - } - } - - if (op_ret > 0) { - head = &hb2->chain; - - op_ret = 0; - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &key2)) { - wake_futex(this); - if (++op_ret >= nr_wake2) - break; - } - } - ret += op_ret; - } - - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); -out: - futex_unlock_mm(fshared); - - return ret; -} - -/* - * Requeue all waiters hashed on one physical page to another - * physical page. 
- */ -static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) -{ - union futex_key key1, key2; - struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head1; - struct futex_q *this, *next; - int ret, drop_count = 0; - - retry: - futex_lock_mm(fshared); - - ret = get_futex_key(uaddr1, fshared, &key1); - if (unlikely(ret != 0)) - goto out; - ret = get_futex_key(uaddr2, fshared, &key2); - if (unlikely(ret != 0)) - goto out; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - - double_lock_hb(hb1, hb2); - - if (likely(cmpval != NULL)) { - u32 curval; - - ret = get_futex_value_locked(&curval, uaddr1); - - if (unlikely(ret)) { - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); - - /* - * If we would have faulted, release mmap_sem, fault - * it in and start all over again. - */ - futex_unlock_mm(fshared); - - ret = get_user(curval, uaddr1); - - if (!ret) - goto retry; - - return ret; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } - - head1 = &hb1->chain; - plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) - continue; - if (++ret <= nr_wake) { - wake_futex(this); - } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); - drop_count++; - - if (ret - nr_wake >= nr_requeue) - break; - } - } - -out_unlock: - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); - - /* drop_futex_key_refs() must be called outside the spinlocks. 
*/ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - -out: - futex_unlock_mm(fshared); - return ret; -} - -/* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) -{ - struct futex_hash_bucket *hb; - - init_waitqueue_head(&q->waiters); - - get_futex_key_refs(&q->key); - hb = hash_futex(&q->key); - q->lock_ptr = &hb->lock; - - spin_lock(&hb->lock); - return hb; -} - -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -{ - int prio; - - /* - * The priority used to register this element is - * - either the real thread-priority for the real-time threads - * (i.e. threads with a priority lower than MAX_RT_PRIO) - * - or MAX_RT_PRIO for non-RT threads. - * Thus, all RT-threads are woken first in priority order, and - * the others are woken last, in FIFO order. - */ - prio = min(current->normal_prio, MAX_RT_PRIO); - - plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; -#endif - plist_add(&q->list, &hb->chain); - q->task = current; - spin_unlock(&hb->lock); -} - -static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) -{ - spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); -} - -/* - * queue_me and unqueue_me must be called as a pair, each - * exactly once. They are called with the hashed spinlock held. - */ - -/* Return 1 if we were still queued (ie. 0 means we were woken) */ -static int unqueue_me(struct futex_q *q) -{ - spinlock_t *lock_ptr; - int ret = 0; - - /* In the common case we don't take the spinlock, which is nice. */ - retry: - lock_ptr = q->lock_ptr; - barrier(); - if (lock_ptr != NULL) { - spin_lock(lock_ptr); - /* - * q->lock_ptr can change between reading it and - * spin_lock(), causing us to take the wrong lock. This - * corrects the race condition. 
- * - * Reasoning goes like this: if we have the wrong lock, - * q->lock_ptr must have changed (maybe several times) - * between reading it and the spin_lock(). It can - * change again after the spin_lock() but only if it was - * already changed before the spin_lock(). It cannot, - * however, change back to the original value. Therefore - * we can detect whether we acquired the correct lock. - */ - if (unlikely(lock_ptr != q->lock_ptr)) { - spin_unlock(lock_ptr); - goto retry; - } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); - - BUG_ON(q->pi_state); - - spin_unlock(lock_ptr); - ret = 1; - } - - drop_futex_key_refs(&q->key); - return ret; -} - -/* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry - * and dropped here. - */ -static void unqueue_me_pi(struct futex_q *q) -{ - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); - - BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); - q->pi_state = NULL; - - spin_unlock(q->lock_ptr); - - drop_futex_key_refs(&q->key); -} - -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, - struct rw_semaphore *fshared) -{ - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; - struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner = pi_state->owner; - u32 uval, curval, newval; - int ret, attempt = 0; - - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; - - /* - * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. 
- * - * Note: We write the user space value _before_ changing the - * pi_state because we can fault here. Imagine swapped out - * pages or a fork, which was running right before we acquired - * mmap_sem, that marked all the anonymous memory readonly for - * cow. - * - * Modifying pi_state _before_ the user space value would - * leave the pi_state in an inconsistent state when we fault - * here, because we need to drop the hash bucket lock to - * handle the fault. This might be observed in the PID check - * in lookup_pi_state. - */ -retry: - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; - - while (1) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) - goto handle_fault; - if (curval == uval) - break; - uval = curval; - } - - /* - * We fixed up user space. Now we need to fix the pi_state - * itself. - */ - if (pi_state->owner != NULL) { - spin_lock_irq(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); - } - - pi_state->owner = newowner; - - spin_lock_irq(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - spin_unlock_irq(&newowner->pi_lock); - return 0; - - /* - * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the - * chance to try the fixup of the pi_state. So once we are - * back from handling the fault we need to check the pi_state - * after reacquiring the hash bucket lock and before trying to - * do another fixup. When the fixup has been done already we - * simply return. 
- */ -handle_fault: - spin_unlock(q->lock_ptr); - - ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - - spin_lock(q->lock_ptr); - - /* - * Check if someone else fixed it for us: - */ - if (pi_state->owner != oldowner) - return 0; - - if (ret) - return ret; - - goto retry; -} - -/* - * In case we must use restart_block to restart a futex_wait, - * we encode in the 'flags' shared capability - */ -#define FLAGS_SHARED 1 - -static long futex_wait_restart(struct restart_block *restart); - -static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, - u32 val, ktime_t *abs_time, u32 bitset) -{ - struct task_struct *curr = current; - DECLARE_WAITQUEUE(wait, curr); - struct futex_hash_bucket *hb; - struct futex_q q; - u32 uval; - int ret; - struct hrtimer_sleeper t; - int rem = 0; - - if (!bitset) - return -EINVAL; - - q.pi_state = NULL; - q.bitset = bitset; - retry: - futex_lock_mm(fshared); - - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) - goto out_release_sem; - - hb = queue_lock(&q); - - /* - * Access the page AFTER the futex is queued. - * Order is important: - * - * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); - * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } - * - * The basic logical guarantee of a futex is that it blocks ONLY - * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with - * cond(var) false, which would violate the guarantee. - * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. - * - * for shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. 
- */ - ret = get_futex_value_locked(&uval, uaddr); - - if (unlikely(ret)) { - queue_unlock(&q, hb); - - /* - * If we would have faulted, release mmap_sem, fault it in and - * start all over again. - */ - futex_unlock_mm(fshared); - - ret = get_user(uval, uaddr); - - if (!ret) - goto retry; - return ret; - } - ret = -EWOULDBLOCK; - if (uval != val) - goto out_unlock_release_sem; - - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); - - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiters, &wait); - /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - t.timer.expires = *abs_time; - - hrtimer_start(&t.timer, t.timer.expires, - HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; - - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. 
- */ - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - - /* Flag if a timeout occured */ - rem = (t.task == NULL); - - destroy_hrtimer_on_stack(&t.timer); - } - } - __set_current_state(TASK_RUNNING); - - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ - - /* If we were woken (and unqueued), we succeeded, whatever. */ - if (!unqueue_me(&q)) - return 0; - if (rem) - return -ETIMEDOUT; - - /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. - */ - if (!abs_time) - return -ERESTARTSYS; - else { - struct restart_block *restart; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; - restart->futex.val = val; - restart->futex.time = abs_time->tv64; - restart->futex.bitset = bitset; - restart->futex.flags = 0; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - return -ERESTART_RESTARTBLOCK; - } - - out_unlock_release_sem: - queue_unlock(&q, hb); - - out_release_sem: - futex_unlock_mm(fshared); - return ret; -} - - -static long futex_wait_restart(struct restart_block *restart) -{ - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - struct rw_semaphore *fshared = NULL; - ktime_t t; - - t.tv64 = restart->futex.time; - restart->fn = do_no_restart_syscall; - if (restart->futex.flags & FLAGS_SHARED) - fshared = ¤t->mm->mmap_sem; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, - restart->futex.bitset); -} - - -/* - * Userspace tried a 0 -> TID atomic transition of the futex value - * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) 
- */ -static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, - int detect, ktime_t *time, int trylock) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; - struct futex_hash_bucket *hb; - u32 uval, newval, curval; - struct futex_q q; - int ret, lock_taken, ownerdied = 0, attempt = 0; - - if (refill_pi_state_cache()) - return -ENOMEM; - - if (time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - to->timer.expires = *time; - } - - q.pi_state = NULL; - retry: - futex_lock_mm(fshared); - - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) - goto out_release_sem; - - retry_unlocked: - hb = queue_lock(&q); - - retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_release_sem; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_release_sem; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! 
- */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_release_sem; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - - if (unlikely(ret)) { - switch (ret) { - - case -EAGAIN: - /* - * Task is exiting and we just wait for the - * exit to complete. - */ - queue_unlock(&q, hb); - futex_unlock_mm(fshared); - cond_resched(); - goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } - default: - goto out_unlock_release_sem; - } - } - - /* - * Only actually queue now that the atomic ops are done: - */ - queue_me(&q, hb); - - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) - ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); - else { - ret = rt_mutex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 0 : -EWOULDBLOCK; - } - - futex_lock_mm(fshared); - spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. 
We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q); - futex_unlock_mm(fshared); - - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; - - out_unlock_release_sem: - queue_unlock(&q, hb); - - out_release_sem: - futex_unlock_mm(fshared); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret; - - uaddr_faulted: - /* - * We have to r/w *(int __user *)uaddr, but we can't modify it - * non-atomically. 
Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - * - * ... and hb->lock. :-) --ANK - */ - queue_unlock(&q, hb); - - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); - if (ret) - goto out_release_sem; - goto retry_unlocked; - } - - futex_unlock_mm(fshared); - - ret = get_user(uval, uaddr); - if (!ret && (uval != -EFAULT)) - goto retry; - - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret; -} - -/* - * Userspace attempted a TID -> 0 atomic transition, and failed. - * This is the in-kernel slowpath: we look up the PI state (if any), - * and do the rt-mutex unlock. - */ -static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) -{ - struct futex_hash_bucket *hb; - struct futex_q *this, *next; - u32 uval; - struct plist_head *head; - union futex_key key; - int ret, attempt = 0; - -retry: - if (get_user(uval, uaddr)) - return -EFAULT; - /* - * We release only a lock we actually own: - */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) - return -EPERM; - /* - * First take all the futex related locks: - */ - futex_lock_mm(fshared); - - ret = get_futex_key(uaddr, fshared, &key); - if (unlikely(ret != 0)) - goto out; - - hb = hash_futex(&key); -retry_unlocked: - spin_lock(&hb->lock); - - /* - * To avoid races, try to do the TID -> 0 atomic transition - * again. 
If it succeeds then we can return without waking - * anyone else up: - */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); - - - if (unlikely(uval == -EFAULT)) - goto pi_faulted; - /* - * Rare case: we managed to release the lock atomically, - * no need to wake anyone else up: - */ - if (unlikely(uval == task_pid_vnr(current))) - goto out_unlock; - - /* - * Ok, other tasks may need to be woken up - check waiters - * and do the wakeup if necessary: - */ - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (!match_futex (&this->key, &key)) - continue; - ret = wake_futex_pi(uaddr, uval, this); - /* - * The atomic access to the futex value - * generated a pagefault, so retry the - * user-access and the wakeup: - */ - if (ret == -EFAULT) - goto pi_faulted; - goto out_unlock; - } - /* - * No waiters - kernel unlocks the futex: - */ - if (!(uval & FUTEX_OWNER_DIED)) { - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; - } - -out_unlock: - spin_unlock(&hb->lock); -out: - futex_unlock_mm(fshared); - - return ret; - -pi_faulted: - /* - * We have to r/w *(int __user *)uaddr, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - * - * ... and hb->lock. --ANK - */ - spin_unlock(&hb->lock); - - if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); - if (ret) - goto out; - uval = 0; - goto retry_unlocked; - } - - futex_unlock_mm(fshared); - - ret = get_user(uval, uaddr); - if (!ret && (uval != -EFAULT)) - goto retry; - - return ret; -} - -/* - * Support for robust futexes: the kernel cleans up held futexes at - * thread exit time. - * - * Implementation: user-space maintains a per-thread list of locks it - * is holding. 
Upon do_exit(), the kernel carefully walks this list, - * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is - * always manipulated with the lock held, so the list is private and - * per-thread. Userspace also maintains a per-thread 'list_op_pending' - * field, to allow the kernel to clean up if the thread dies after - * acquiring the lock, but just before it could have added itself to - * the list. There can only be one such pending lock. - */ - -/** - * sys_set_robust_list - set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects - */ -SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, - size_t, len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - /* - * The kernel knows only one size for now: - */ - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->robust_list = head; - - return 0; -} - -/** - * sys_get_robust_list - get the robust-futex list head of a task - * @pid: pid of the process [zero for current task] - * @head_ptr: pointer to a list-head pointer, the kernel fills it in - * @len_ptr: pointer to a length field, the kernel fills in the header size - */ -SYSCALL_DEFINE3(get_robust_list, int, pid, - struct robust_list_head __user * __user *, head_ptr, - size_t __user *, len_ptr) -{ - struct robust_list_head __user *head; - unsigned long ret; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (!pid) - head = current->robust_list; - else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_PTRACE)) - goto err_unlock; - head = p->robust_list; - rcu_read_unlock(); - } - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(head, head_ptr); - -err_unlock: - 
rcu_read_unlock(); - - return ret; -} - -/* - * Process a futex-list entry, check whether it's owned by the - * dying task, and do notification if so: - */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) -{ - u32 uval, nval, mval; - -retry: - if (get_user(uval, uaddr)) - return -1; - - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) - return -1; - - if (nval != uval) - goto retry; - - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1, - FUTEX_BITSET_MATCH_ANY); - } - return 0; -} - -/* - * Fetch a robust-list pointer. Bit 0 signals PI futexes: - */ -static inline int fetch_robust_entry(struct robust_list __user **entry, - struct robust_list __user * __user *head, - int *pi) -{ - unsigned long uentry; - - if (get_user(uentry, (unsigned long __user *)head)) - return -EFAULT; - - *entry = (void __user *)(uentry & ~1UL); - *pi = uentry & 1; - - return 0; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. 
- */ -void exit_robust_list(struct task_struct *curr) -{ - struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; - unsigned long futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * don't process it twice: - */ - if (entry != pending) - if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi)) - return; - if (rc) - return; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - - if (pending) - handle_futex_death((void __user *)pending + futex_offset, - curr, pip); -} - -long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - u32 __user *uaddr2, u32 val2, u32 val3) -{ - int ret = -ENOSYS; - int cmd = op & FUTEX_CMD_MASK; - struct rw_semaphore *fshared = NULL; - - if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = ¤t->mm->mmap_sem; - - switch (cmd) { - case FUTEX_WAIT: - val3 = FUTEX_BITSET_MATCH_ANY; - case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3); - break; - case FUTEX_WAKE: - val3 = FUTEX_BITSET_MATCH_ANY; - case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, fshared, 
val, val3); - break; - case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); - break; - case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); - break; - case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); - break; - case FUTEX_LOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); - break; - case FUTEX_UNLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, fshared); - break; - case FUTEX_TRYLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); - break; - default: - ret = -ENOSYS; - } - return ret; -} - - -SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct timespec __user *, utime, u32 __user *, uaddr2, - u32, val3) -{ - struct timespec ts; - ktime_t t, *tp = NULL; - u32 val2 = 0; - int cmd = op & FUTEX_CMD_MASK; - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { - if (copy_from_user(&ts, utime, sizeof(ts)) != 0) - return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - - t = timespec_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } - /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. - * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. - */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -} - -static int __init futex_init(void) -{ - u32 curval; - int i; - - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. 
NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non functional ones will return - * -ENOSYS. - */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) - futex_cmpxchg_enabled = 1; - - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); - spin_lock_init(&futex_queues[i].lock); - } - - return 0; -} -__initcall(futex_init); -/* - * linux/kernel/futex_compat.c - * - * Futex compatibililty routines. - * - * Copyright 2006, Red Hat, Inc., Ingo Molnar - */ - -#include -#include -#include -#include - -#include - - -/* - * Fetch a robust-list pointer. Bit 0 signals PI futexes: - */ -static inline int -fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, int *pi) -{ - if (get_user(*uentry, head)) - return -EFAULT; - - *entry = compat_ptr((*uentry) & ~1); - *pi = (unsigned int)(*uentry) & 1; - - return 0; -} - -static void __user *futex_uaddr(struct robust_list __user *entry, - compat_long_t futex_offset) -{ - compat_uptr_t base = ptr_to_compat(entry); - void __user *uaddr = compat_ptr(base + futex_offset); - - return uaddr; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. 
- */ -void compat_exit_robust_list(struct task_struct *curr) -{ - struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; - compat_uptr_t uentry, next_uentry, upending; - compat_long_t futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&upending, &pending, - &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != (struct robust_list __user *) &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_uentry, &next_entry, - (compat_uptr_t __user *)&entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * dont process it twice: - */ - if (entry != pending) { - void __user *uaddr = futex_uaddr(entry, futex_offset); - - if (handle_futex_death(uaddr, curr, pi)) - return; - } - if (rc) - return; - uentry = next_uentry; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - if (pending) { - void __user *uaddr = futex_uaddr(pending, futex_offset); - - handle_futex_death(uaddr, curr, pip); - } -} - -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->compat_robust_list = head; - - return 0; -} - -asmlinkage long 
-compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, - compat_size_t __user *len_ptr) -{ - struct compat_robust_list_head __user *head; - unsigned long ret; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (!pid) - head = current->compat_robust_list; - else { - struct task_struct *p; - - ret = -ESRCH; - read_lock(&tasklist_lock); - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_PTRACE)) - goto err_unlock; - head = p->compat_robust_list; - read_unlock(&tasklist_lock); - } - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - read_unlock(&tasklist_lock); - - return ret; -} - -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - u32 val3) -{ - struct timespec ts; - ktime_t t, *tp = NULL; - int val2 = 0; - int cmd = op & FUTEX_CMD_MASK; - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { - if (get_compat_timespec(&ts, utime)) - return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - - t = timespec_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) - val2 = (int) (unsigned long) utime; - - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -} -/* - * linux/kernel/hrtimer.c - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner - * - * High-resolution kernel timers - * - * In contrast to the low-resolution timeout API implemented in - * kernel/timer.c, hrtimers provide finer resolution and accuracy - * depending on system configuration and capabilities. 
- * - * These timers are currently used for: - * - itimers - * - POSIX timers - * - nanosleep - * - precise in-kernel timing - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * Credits: - * based on kernel/timer.c - * - * Help, testing, suggestions, bugfixes, improvements were - * provided by: - * - * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel - * et. al. - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); - -/* - * The timer bases: - * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. 
- */ -DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = -{ - - .clock_base = - { - { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, - }, - { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, - }, - } -}; - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. - */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, tomono; - struct timespec xts, tom; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - xts = current_kernel_time(); - tom = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - - xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); -} - -/* - * Functions and macros which are different for UP/SMP systems are kept in a - * single place - */ -#ifdef CONFIG_SMP - -/* - * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. 
- * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on the lists/queues. - * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. - */ -static -struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) -{ - struct hrtimer_clock_base *base; - - for (;;) { - base = timer->base; - if (likely(base != NULL)) { - spin_lock_irqsave(&base->cpu_base->lock, *flags); - if (likely(base == timer->base)) - return base; - /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->cpu_base->lock, *flags); - } - cpu_relax(); - } -} - -/* - * Switch the timer base to the current CPU when possible. - */ -static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) -{ - struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - - new_cpu_base = &__get_cpu_var(hrtimer_bases); - new_base = &new_cpu_base->clock_base[base->index]; - - if (base != new_base) { - /* - * We are trying to schedule the timer on the local CPU. - * However we can't change timer's base while it is running, - * so we keep it on the same CPU. No hassle vs. reprogramming - * the event source in the high resolution case. The softirq - * code will take care of this when the timer function has - * completed. There is no conflict as we hold the lock until - * the timer is enqueued. 
- */ - if (unlikely(hrtimer_callback_running(timer))) - return base; - - /* See the comment in lock_timer_base() */ - timer->base = NULL; - spin_unlock(&base->cpu_base->lock); - spin_lock(&new_base->cpu_base->lock); - timer->base = new_base; - } - return new_base; -} - -#else /* CONFIG_SMP */ - -static inline struct hrtimer_clock_base * -lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) -{ - struct hrtimer_clock_base *base = timer->base; - - spin_lock_irqsave(&base->cpu_base->lock, *flags); - - return base; -} - -# define switch_hrtimer_base(t, b) (b) - -#endif /* !CONFIG_SMP */ - -/* - * Functions for the union type storage format of ktime_t which are - * too large for inlining: - */ -#if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - -/* - * Divide a ktime value by a nanosecond value - */ -u64 ktime_divns(const ktime_t kt, s64 div) -{ - u64 dclc; - int sft = 0; - - dclc = ktime_to_ns(kt); - /* Make sure the 
divisor is less than 2^32: */ - while (div >> 32) { - sft++; - div >>= 1; - } - dclc >>= sft; - do_div(dclc, (unsigned long) div); - - return dclc; -} -#endif /* BITS_PER_LONG >= 64 */ - -/* - * Add two ktime values and do a safety check for overflow: - */ -ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) -{ - ktime_t res = ktime_add(lhs, rhs); - - /* - * We use KTIME_SEC_MAX here, the maximum timeout which we can - * return to user space in a timespec: - */ - if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) - res = ktime_set(KTIME_SEC_MAX, 0); - - return res; -} - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr hrtimer_debug_descr; - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct hrtimer *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - hrtimer_cancel(timer); - debug_object_init(timer, &hrtimer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) -{ - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct hrtimer *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - hrtimer_cancel(timer); - debug_object_free(timer, &hrtimer_debug_descr); - return 1; - default: - return 0; - } -} - -static struct debug_obj_descr hrtimer_debug_descr = { - .name = "hrtimer", - .fixup_init = hrtimer_fixup_init, - .fixup_activate = hrtimer_fixup_activate, - .fixup_free = hrtimer_fixup_free, -}; - -static 
inline void debug_hrtimer_init(struct hrtimer *timer) -{ - debug_object_init(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_activate(struct hrtimer *timer) -{ - debug_object_activate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) -{ - debug_object_deactivate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_object_init_on_stack(timer, &hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); -} - -void destroy_hrtimer_on_stack(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - -#else -static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } -#endif - -/* - * Check, whether the timer is on the callback pending list - */ -static inline int hrtimer_cb_pending(const struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_PENDING; -} - -/* - * Remove a timer from the callback pending list - */ -static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) -{ - list_del_init(&timer->cb_entry); -} - -/* High resolution timer related functions */ -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer enabled ? 
- */ -static int hrtimer_hres_enabled __read_mostly = 1; - -/* - * Enable / Disable high resolution mode - */ -static int __init setup_hrtimer_hres(char *str) -{ - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; -} - -__setup("highres=", setup_hrtimer_hres); - -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) -{ - return hrtimer_hres_enabled; -} - -/* - * Is the high resolution mode active ? - */ -static inline int hrtimer_hres_active(void) -{ - return __get_cpu_var(hrtimer_bases).hres_active; -} - -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) -{ - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires; - - cpu_base->expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - - if (!base->first) - continue; - timer = rb_entry(base->first, struct hrtimer, node); - expires = ktime_sub(timer->expires, base->offset); - if (expires.tv64 < cpu_base->expires_next.tv64) - cpu_base->expires_next = expires; - } - - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); -} - -/* - * Shared reprogramming for clock_realtime and clock_monotonic - * - * When a timer is enqueued and expires earlier than the already enqueued - * timers, we have to check, whether it expires earlier than the timer for - * which the clock event device was armed. 
- * - * Called with interrupts disabled and base->cpu_base.lock held - */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; - ktime_t expires = ktime_sub(timer->expires, base->offset); - int res; - - WARN_ON_ONCE(timer->expires.tv64 < 0); - - /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. - */ - if (hrtimer_callback_running(timer)) - return 0; - - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. - */ - if (expires.tv64 < 0) - return -ETIME; - - if (expires.tv64 >= expires_next->tv64) - return 0; - - /* - * Clockevents returns -ETIME, when the event was in the past. 
- */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - *expires_next = expires; - return res; -} - - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base; - struct timespec realtime_offset; - unsigned long seq; - - if (!hrtimer_hres_active()) - return; - - do { - seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); - } while (read_seqretry(&xtime_lock, seq)); - - base = &__get_cpu_var(hrtimer_bases); - - /* Adjust CLOCK_REALTIME offset */ - spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = - timespec_to_ktime(realtime_offset); - - hrtimer_force_reprogram(base); - spin_unlock(&base->lock); -} - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. 
- */ -void clock_was_set(void) -{ - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): - */ -void hres_timers_resume(void) -{ - /* Retrigger the CPU local events: */ - retrigger_next_event(NULL); -} - -/* - * Initialize the high resolution related parts of cpu_base - */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) -{ - base->expires_next.tv64 = KTIME_MAX; - base->hres_active = 0; -} - -/* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. - */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - - /* Timer is expired, act upon the callback mode */ - switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_NO_RESTART: - debug_hrtimer_deactivate(timer); - /* - * We can call the callback from here. No restart - * happens, so no danger of recursion - */ - BUG_ON(timer->function(timer) != HRTIMER_NORESTART); - return 1; - case HRTIMER_CB_IRQSAFE_PERCPU: - case HRTIMER_CB_IRQSAFE_UNLOCKED: - /* - * This is solely for the sched tick emulation with - * dynamic tick support to ensure that we do not - * restart the tick right on the edge and end up with - * the tick timer in the softirq ! The calling site - * takes care of this. Also used for hrtimer sleeper ! 
- */ - debug_hrtimer_deactivate(timer); - return 1; - case HRTIMER_CB_IRQSAFE: - case HRTIMER_CB_SOFTIRQ: - /* - * Move everything else into the softirq pending list ! - */ - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - timer->state = HRTIMER_STATE_PENDING; - return 1; - default: - BUG(); - } - } - return 0; -} - -/* - * Switch to high resolution mode - */ -static int hrtimer_switch_to_hres(void) -{ - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); - - if (tick_init_highres()) { - local_irq_restore(flags); - printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); - return 0; - } - base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; - - tick_setup_sched_timer(); - - /* "Retrigger" the interrupt to get things going */ - retrigger_next_event(NULL); - local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); - return 1; -} - -static inline void hrtimer_raise_softirq(void) -{ - raise_softirq(HRTIMER_SOFTIRQ); -} - -#else - -static inline int hrtimer_hres_active(void) { return 0; } -static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} -static inline void hrtimer_raise_softirq(void) { } - -#endif /* 
CONFIG_HIGH_RES_TIMERS */ - -#ifdef CONFIG_TIMER_STATS -void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} -#endif - -/* - * Counterpart to lock_hrtimer_base above: - */ -static inline -void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) -{ - spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); -} - -/** - * hrtimer_forward - forward the timer expiry - * @timer: hrtimer to forward - * @now: forward past this time - * @interval: the interval to forward - * - * Forward the timer expiry so it will expire in the future. - * Returns the number of overruns. - */ -u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) -{ - u64 orun = 1; - ktime_t delta; - - delta = ktime_sub(now, timer->expires); - - if (delta.tv64 < 0) - return 0; - - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; - - if (unlikely(delta.tv64 >= interval.tv64)) { - s64 incr = ktime_to_ns(interval); - - orun = ktime_divns(delta, incr); - timer->expires = ktime_add_ns(timer->expires, incr * orun); - if (timer->expires.tv64 > now.tv64) - return orun; - /* - * This (and the ktime_add() below) is the - * correction for exact: - */ - orun++; - } - timer->expires = ktime_add_safe(timer->expires, interval); - - return orun; -} -EXPORT_SYMBOL_GPL(hrtimer_forward); - -/* - * enqueue_hrtimer - internal function to (re)start a timer - * - * The timer is inserted in expiry order. Insertion into the - * red black tree is O(log(n)). Must hold the base lock. 
- */ -static void enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, int reprogram) -{ - struct rb_node **link = &base->active.rb_node; - struct rb_node *parent = NULL; - struct hrtimer *entry; - int leftmost = 1; - - debug_hrtimer_activate(timer); - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct hrtimer, node); - /* - * We dont care about collisions. Nodes with - * the same expiry time stay together. - */ - if (timer->expires.tv64 < entry->expires.tv64) { - link = &(*link)->rb_left; - } else { - link = &(*link)->rb_right; - leftmost = 0; - } - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - if (leftmost) { - /* - * Reprogram the clock event device. When the timer is already - * expired hrtimer_enqueue_reprogram has either called the - * callback or added it to the pending list and raised the - * softirq. - * - * This is a NOP for !HIGHRES - */ - if (reprogram && hrtimer_enqueue_reprogram(timer, base)) - return; - - base->first = &timer->node; - } - - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; -} - -/* - * __remove_hrtimer - internal function to remove a timer - * - * Caller must hold the base lock. - * - * High resolution timer mode reprograms the clock event device when the - * timer is the one which expires next. The caller can disable this by setting - * reprogram to zero. This is useful, when the context does a reprogramming - * anyway (e.g. timer interrupt) - */ -static void __remove_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) -{ - /* High res. callback list. 
NOP for !HIGHRES */ - if (hrtimer_cb_pending(timer)) - hrtimer_remove_cb_pending(timer); - else { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) - hrtimer_force_reprogram(base->cpu_base); - } - rb_erase(&timer->node, &base->active); - } - timer->state = newstate; -} - -/* - * remove hrtimer, called with base lock held - */ -static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) -{ - if (hrtimer_is_queued(timer)) { - int reprogram; - - /* - * Remove the timer and force reprogramming when high - * resolution mode is active and the timer is on the current - * CPU. If we remove a timer on another CPU, reprogramming is - * skipped. The interrupt event on this CPU is fired and - * reprogramming happens in the interrupt handler. This is a - * rare case and less expensive than a smp call. 
- */ - debug_hrtimer_deactivate(timer); - timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, - reprogram); - return 1; - } - return 0; -} - -/** - * hrtimer_start - (re)start an relative timer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int ret, raise; - - base = lock_hrtimer_base(timer, &flags); - - /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); - - /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base); - - if (mode == HRTIMER_MODE_REL) { - tim = ktime_add_safe(tim, new_base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); -#endif - } - - timer->expires = tim; - - timer_stats_hrtimer_set_start_info(timer); - - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - */ - enqueue_hrtimer(timer, new_base, - new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); - - /* - * The timer may be expired and moved to the cb_pending - * list. We can not raise the softirq with base lock held due - * to a possible deadlock with runqueue lock. 
- */ - raise = timer->state == HRTIMER_STATE_PENDING; - - /* - * We use preempt_disable to prevent this task from migrating after - * setting up the softirq and raising it. Otherwise, if me migrate - * we will raise the softirq on the wrong CPU. - */ - preempt_disable(); - - unlock_hrtimer_base(timer, &flags); - - if (raise) - hrtimer_raise_softirq(); - preempt_enable(); - - return ret; -} -EXPORT_SYMBOL_GPL(hrtimer_start); - -/** - * hrtimer_try_to_cancel - try to deactivate a timer - * @timer: hrtimer to stop - * - * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently excuting the callback function and - * cannot be stopped - */ -int hrtimer_try_to_cancel(struct hrtimer *timer) -{ - struct hrtimer_clock_base *base; - unsigned long flags; - int ret = -1; - - base = lock_hrtimer_base(timer, &flags); - - if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); - - unlock_hrtimer_base(timer, &flags); - - return ret; - -} -EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); - -/** - * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
- * @timer: the timer to be cancelled - * - * Returns: - * 0 when the timer was not active - * 1 when the timer was active - */ -int hrtimer_cancel(struct hrtimer *timer) -{ - for (;;) { - int ret = hrtimer_try_to_cancel(timer); - - if (ret >= 0) - return ret; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(hrtimer_cancel); - -/** - * hrtimer_get_remaining - get remaining time for the timer - * @timer: the timer to read - */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) -{ - struct hrtimer_clock_base *base; - unsigned long flags; - ktime_t rem; - - base = lock_hrtimer_base(timer, &flags); - rem = ktime_sub(timer->expires, base->get_time()); - unlock_hrtimer_base(timer, &flags); - - return rem; -} -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); - -#ifdef CONFIG_NO_HZ -/** - * hrtimer_get_next_event - get the time until next expiry event - * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. - */ -ktime_t hrtimer_get_next_event(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; - unsigned long flags; - int i; - - spin_lock_irqsave(&cpu_base->lock, flags); - - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - - if (!base->first) - continue; - - timer = rb_entry(base->first, struct hrtimer, node); - delta.tv64 = timer->expires.tv64; - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } - - spin_unlock_irqrestore(&cpu_base->lock, flags); - - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; -} -#endif - -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - struct hrtimer_cpu_base *cpu_base; - - memset(timer, 0, sizeof(struct hrtimer)); - - cpu_base = &__raw_get_cpu_var(hrtimer_bases); - - if (clock_id == 
CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) - clock_id = CLOCK_MONOTONIC; - - timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); - hrtimer_init_timer_hres(timer); - -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif -} - -/** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: timer mode abs/rel - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_hrtimer_init(timer); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init); - -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution - * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. - */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) -{ - struct hrtimer_cpu_base *cpu_base; - - cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); - - return 0; -} -EXPORT_SYMBOL_GPL(hrtimer_get_res); - -static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) -{ - spin_lock_irq(&cpu_base->lock); - - while (!list_empty(&cpu_base->cb_pending)) { - enum hrtimer_restart (*fn)(struct hrtimer *); - struct hrtimer *timer; - int restart; - - timer = list_entry(cpu_base->cb_pending.next, - struct hrtimer, cb_entry); - - debug_hrtimer_deactivate(timer); - timer_stats_account_hrtimer(timer); - - fn = timer->function; - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); - spin_unlock_irq(&cpu_base->lock); - - restart = fn(timer); - - spin_lock_irq(&cpu_base->lock); - - timer->state &= ~HRTIMER_STATE_CALLBACK; - if (restart == HRTIMER_RESTART) { - BUG_ON(hrtimer_active(timer)); - /* - * Enqueue the 
timer, allow reprogramming of the event - * device - */ - enqueue_hrtimer(timer, timer->base, 1); - } else if (hrtimer_active(timer)) { - /* - * If the timer was rearmed on another CPU, reprogram - * the event device. - */ - struct hrtimer_clock_base *base = timer->base; - - if (base->first == &timer->node && - hrtimer_reprogram(timer, base)) { - /* - * Timer is expired. Thus move it from tree to - * pending list again. - */ - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - } - } - } - spin_unlock_irq(&cpu_base->lock); -} - -static void __run_hrtimer(struct hrtimer *timer) -{ - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; - enum hrtimer_restart (*fn)(struct hrtimer *); - int restart; - - debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); - timer_stats_account_hrtimer(timer); - - fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || - timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { - /* - * Used for scheduler timers, avoid lock inversion with - * rq->lock and tasklist_lock. - * - * These timers are required to deal with enqueue expiry - * themselves and are not allowed to migrate. - */ - spin_unlock(&cpu_base->lock); - restart = fn(timer); - spin_lock(&cpu_base->lock); - } else - restart = fn(timer); - - /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid - * reprogramming of the event hardware. This happens at the end of this - * function anyway. 
- */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base, 0); - } - timer->state &= ~HRTIMER_STATE_CALLBACK; -} - -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base; - ktime_t expires_next, now; - int i, raise = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - retry: - now = ktime_get(); - - expires_next.tv64 = KTIME_MAX; - - base = cpu_base->clock_base; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - ktime_t basenow; - struct rb_node *node; - - spin_lock(&cpu_base->lock); - - basenow = ktime_add(now, base->offset); - - while ((node = base->first)) { - struct hrtimer *timer; - - timer = rb_entry(node, struct hrtimer, node); - - if (basenow.tv64 < timer->expires.tv64) { - ktime_t expires; - - expires = ktime_sub(timer->expires, - base->offset); - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - break; - } - - /* Move softirq callbacks to the pending list */ - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); - raise = 1; - continue; - } - - __run_hrtimer(timer); - } - spin_unlock(&cpu_base->lock); - base++; - } - - cpu_base->expires_next = expires_next; - - /* Reprogramming necessary ? */ - if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, 0)) - goto retry; - } - - /* Raise softirq ? 
*/ - if (raise) - raise_softirq(HRTIMER_SOFTIRQ); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); -} - -#endif /* CONFIG_HIGH_RES_TIMERS */ - -/* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); - - run_hrtimer_pending(cpu_base); -} - -/* - * Called from hardirq context every jiffy - */ -void hrtimer_run_queues(void) -{ - struct rb_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; - - if (hrtimer_hres_active()) - return; - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - - if (!base->first) - continue; - - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - spin_lock(&cpu_base->lock); - - while ((node = base->first)) { - struct hrtimer *timer; - - timer = rb_entry(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= timer->expires.tv64) - break; - - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, - HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - 
&base->cpu_base->cb_pending); - continue; - } - - __run_hrtimer(timer); - } - spin_unlock(&cpu_base->lock); - } -} - -/* - * Sleep related functions: - */ -static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) -{ - struct hrtimer_sleeper *t = - container_of(timer, struct hrtimer_sleeper, timer); - struct task_struct *task = t->task; - - t->task = NULL; - if (task) - wake_up_process(task); - - return HRTIMER_NORESTART; -} - -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) -{ - sl->timer.function = hrtimer_wakeup; - sl->task = task; -#ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; -#endif -} - -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) -{ - hrtimer_init_sleeper(t, current); - - do { - set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start(&t->timer, t->timer.expires, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; - - if (likely(t->task)) - schedule(); - - hrtimer_cancel(&t->timer); - mode = HRTIMER_MODE_ABS; - - } while (t->task && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - - return t->task == NULL; -} - -static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = ktime_sub(timer->expires, timer->base->get_time()); - if (rem.tv64 <= 0) - return 0; - rmt = ktime_to_timespec(rem); - - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - - return 1; -} - -long __sched hrtimer_nanosleep_restart(struct restart_block *restart) -{ - struct hrtimer_sleeper t; - struct timespec __user *rmtp; - int ret = 0; - - hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, - HRTIMER_MODE_ABS); - t.timer.expires.tv64 = restart->nanosleep.expires; - - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - goto out; - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - /* The other values in restart 
are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - destroy_hrtimer_on_stack(&t.timer); - return ret; -} - -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid) -{ - struct restart_block *restart; - struct hrtimer_sleeper t; - int ret = 0; - - hrtimer_init_on_stack(&t.timer, clockid, mode); - t.timer.expires = timespec_to_ktime(*rqtp); - if (do_nanosleep(&t, mode)) - goto out; - - /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_MODE_ABS) { - ret = -ERESTARTNOHAND; - goto out; - } - - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - restart = ¤t_thread_info()->restart_block; - restart->fn = hrtimer_nanosleep_restart; - restart->nanosleep.index = t.timer.base->index; - restart->nanosleep.rmtp = rmtp; - restart->nanosleep.expires = t.timer.expires.tv64; - - ret = -ERESTART_RESTARTBLOCK; -out: - destroy_hrtimer_on_stack(&t.timer); - return ret; -} - -SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, - struct timespec __user *, rmtp) -{ - struct timespec tu; - - if (copy_from_user(&tu, rqtp, sizeof(tu))) - return -EFAULT; - - if (!timespec_valid(&tu)) - return -EINVAL; - - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); -} - -/* - * Functions related to boot-time initialization: - */ -static void __cpuinit init_hrtimers_cpu(int cpu) -{ - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - - spin_lock_init(&cpu_base->lock); - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - cpu_base->clock_base[i].cpu_base = cpu_base; - - INIT_LIST_HEAD(&cpu_base->cb_pending); - hrtimer_init_hres(cpu_base); -} - -#ifdef CONFIG_HOTPLUG_CPU - -static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base, int dcpu) -{ - struct hrtimer *timer; - struct rb_node *node; - int raise = 0; - - while ((node = 
rb_first(&old_base->active))) { - timer = rb_entry(node, struct hrtimer, node); - BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); - - /* - * Should not happen. Per CPU timers should be - * canceled _before_ the migration code is called - */ - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { - __remove_hrtimer(timer, old_base, - HRTIMER_STATE_INACTIVE, 0); - WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", - timer, timer->function, dcpu); - continue; - } - - /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the - * timer could be seen as !active and just vanish away - * under us on another CPU - */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); - timer->base = new_base; - /* - * Enqueue the timer. Allow reprogramming of the event device - */ - enqueue_hrtimer(timer, new_base, 1); - -#ifdef CONFIG_HIGH_RES_TIMERS - /* - * Happens with high res enabled when the timer was - * already expired and the callback mode is - * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The - * enqueue code does not move them to the soft irq - * pending list for performance/latency reasons, but - * in the migration state, we need to do that - * otherwise we end up with a stale timer. 
- */ - if (timer->state == HRTIMER_STATE_MIGRATE) { - timer->state = HRTIMER_STATE_PENDING; - list_add_tail(&timer->cb_entry, - &new_base->cpu_base->cb_pending); - raise = 1; - } -#endif - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; - } - return raise; -} - -#ifdef CONFIG_HIGH_RES_TIMERS -static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base) -{ - struct hrtimer *timer; - int raise = 0; - - while (!list_empty(&old_base->cb_pending)) { - timer = list_entry(old_base->cb_pending.next, - struct hrtimer, cb_entry); - - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); - timer->base = &new_base->clock_base[timer->base->index]; - list_add_tail(&timer->cb_entry, &new_base->cb_pending); - raise = 1; - } - return raise; -} -#else -static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, - struct hrtimer_cpu_base *new_base) -{ - return 0; -} -#endif - -static void migrate_hrtimers(int cpu) -{ - struct hrtimer_cpu_base *old_base, *new_base; - int i, raise = 0; - - BUG_ON(cpu_online(cpu)); - old_base = &per_cpu(hrtimer_bases, cpu); - new_base = &get_cpu_var(hrtimer_bases); - - tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - if (migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i], cpu)) - raise = 1; - } - - if (migrate_hrtimer_pending(old_base, new_base)) - raise = 1; - - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); - put_cpu_var(hrtimer_bases); - - if (raise) - hrtimer_raise_softirq(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - 
init_hrtimers_cpu(cpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); - migrate_hrtimers(cpu); - break; -#endif - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata hrtimers_nb = { - .notifier_call = hrtimer_cpu_notify, -}; - -void __init hrtimers_init(void) -{ - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif -} - -/* - * linux/kernel/irq/autoprobe.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file contains the interrupt probing code and driver APIs. - */ - -#include -#include -#include -#include - -#include "internals.h" - -/* - * Autodetection depends on the fact that any interrupt that - * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. - */ -static DEFINE_MUTEX(probing_active); - -/** - * probe_irq_on - begin an interrupt autodetect - * - * Commence probing for an interrupt. The interrupts are scanned - * and a mask of potential interrupt lines is returned. - * - */ -unsigned long probe_irq_on(void) -{ - struct irq_desc *desc; - unsigned long mask; - unsigned int i; - - mutex_lock(&probing_active); - /* - * something may have generated an irq long ago and we want to - * flush such a longstanding irq before considering it as spurious. 
- */ - for (i = NR_IRQS-1; i > 0; i--) { - desc = irq_desc + i; - - spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - - /* - * Some chips need to know about probing in - * progress: - */ - if (desc->chip->set_type) - desc->chip->set_type(i, IRQ_TYPE_PROBE); - desc->chip->startup(i); - } - spin_unlock_irq(&desc->lock); - } - - /* Wait for longstanding interrupts to trigger. */ - msleep(20); - - /* - * enable any unassigned irqs - * (we must startup again here because if a longstanding irq - * happened in the previous stage, it may have masked itself) - */ - for (i = NR_IRQS-1; i > 0; i--) { - desc = irq_desc + i; - - spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->chip->startup(i)) - desc->status |= IRQ_PENDING; - } - spin_unlock_irq(&desc->lock); - } - - /* - * Wait for spurious interrupts to trigger - */ - msleep(100); - - /* - * Now filter out any obviously spurious interrupts - */ - mask = 0; - for (i = 0; i < NR_IRQS; i++) { - unsigned int status; - - desc = irq_desc + i; - spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - /* It triggered already - consider it spurious. */ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); - } else - if (i < 32) - mask |= 1 << i; - } - spin_unlock_irq(&desc->lock); - } - - return mask; -} -EXPORT_SYMBOL(probe_irq_on); - -/** - * probe_irq_mask - scan a bitmap of interrupt lines - * @val: mask of interrupts to consider - * - * Scan the interrupt lines and return a bitmap of active - * autodetect interrupts. The interrupt probe logic state - * is then returned to its previous value. 
- * - * Note: we need to scan all the irq's even though we will - * only return autodetect irq numbers - just so that we reset - * them all to a known state. - */ -unsigned int probe_irq_mask(unsigned long val) -{ - unsigned int mask; - int i; - - mask = 0; - for (i = 0; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - unsigned int status; - - spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) - mask |= 1 << i; - - desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); - } - spin_unlock_irq(&desc->lock); - } - mutex_unlock(&probing_active); - - return mask & val; -} -EXPORT_SYMBOL(probe_irq_mask); - -/** - * probe_irq_off - end an interrupt autodetect - * @val: mask of potential interrupts (unused) - * - * Scans the unused interrupt lines and returns the line which - * appears to have triggered the interrupt. If no interrupt was - * found then zero is returned. If more than one interrupt is - * found then minus the first candidate is returned to indicate - * their is doubt. - * - * The interrupt probe logic state is returned to its previous - * value. - * - * BUGS: When used in a module (which arguably shouldn't happen) - * nothing prevents two IRQ probe callers from overlapping. The - * results of this are non-optimal. 
- */ -int probe_irq_off(unsigned long val) -{ - int i, irq_found = 0, nr_irqs = 0; - - for (i = 0; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - unsigned int status; - - spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { - if (!nr_irqs) - irq_found = i; - nr_irqs++; - } - desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); - } - spin_unlock_irq(&desc->lock); - } - mutex_unlock(&probing_active); - - if (nr_irqs > 1) - irq_found = -irq_found; - - return irq_found; -} -EXPORT_SYMBOL(probe_irq_off); - -/* - * linux/kernel/irq/chip.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code, for irq-chip - * based architectures. - * - * Detailed information is available in Documentation/DocBook/genericirq - */ - -#include -#include -#include -#include -#include - -#include "internals.h" - -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - return; - } - - /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); - desc->status = IRQ_DISABLED; - desc->chip = &no_irq_chip; - desc->handle_irq = handle_bad_irq; - desc->depth = 1; - desc->msi_desc = NULL; - desc->handler_data = NULL; - desc->chip_data = NULL; - desc->action = NULL; - desc->irq_count = 0; - desc->irqs_unhandled = 0; -#ifdef CONFIG_SMP - cpus_setall(desc->affinity); -#endif - spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int 
irq) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - return; - } - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); - if (desc->action) { - spin_unlock_irqrestore(&desc->lock, flags); - WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", - irq); - return; - } - desc->msi_desc = NULL; - desc->handler_data = NULL; - desc->chip_data = NULL; - desc->handle_irq = handle_bad_irq; - desc->chip = &no_irq_chip; - desc->name = NULL; - spin_unlock_irqrestore(&desc->lock, flags); -} - - -/** - * set_irq_chip - set the irq chip for an irq - * @irq: irq number - * @chip: pointer to irq chip description structure - */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); - return -EINVAL; - } - - if (!chip) - chip = &no_irq_chip; - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); - irq_chip_set_defaults(chip); - desc->chip = chip; - spin_unlock_irqrestore(&desc->lock, flags); - - return 0; -} -EXPORT_SYMBOL(set_irq_chip); - -/** - * set_irq_type - set the irq type for an irq - * @irq: irq number - * @type: interrupt type - see include/linux/interrupt.h - */ -int set_irq_type(unsigned int irq, unsigned int type) -{ - struct irq_desc *desc; - unsigned long flags; - int ret = -ENXIO; - - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } - - desc = irq_desc + irq; - if (desc->chip->set_type) { - spin_lock_irqsave(&desc->lock, flags); - ret = desc->chip->set_type(irq, type); - spin_unlock_irqrestore(&desc->lock, flags); - } - return ret; -} -EXPORT_SYMBOL(set_irq_type); - -/** - * set_irq_data - set irq type data for an irq - * @irq: Interrupt number - * @data: Pointer to interrupt specific data - * - * Set the hardware irq controller data for an irq 
- */ -int set_irq_data(unsigned int irq, void *data) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); - return -EINVAL; - } - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); - desc->handler_data = data; - spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} -EXPORT_SYMBOL(set_irq_data); - -/** - * set_irq_data - set irq type data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data - * - * Set the hardware irq controller data for an irq - */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); - return -EINVAL; - } - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); - desc->msi_desc = entry; - if (entry) - entry->irq = irq; - spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - -/** - * set_irq_chip_data - set irq chip data for an irq - * @irq: Interrupt number - * @data: Pointer to chip specific data - * - * Set the hardware irq chip data for an irq - */ -int set_irq_chip_data(unsigned int irq, void *data) -{ - struct irq_desc *desc = irq_desc + irq; - unsigned long flags; - - if (irq >= NR_IRQS || !desc->chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); - return -EINVAL; - } - - spin_lock_irqsave(&desc->lock, flags); - desc->chip_data = data; - spin_unlock_irqrestore(&desc->lock, flags); - - return 0; -} -EXPORT_SYMBOL(set_irq_chip_data); - -/* - * default enable function - */ -static void default_enable(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; -} - -/* - * default disable function - */ -static void default_disable(unsigned int irq) -{ -} - -/* - * default startup function - */ -static unsigned int default_startup(unsigned int irq) -{ - 
irq_desc[irq].chip->enable(irq); - - return 0; -} - -/* - * default shutdown function - */ -static void default_shutdown(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - - desc->chip->mask(irq); - desc->status |= IRQ_MASKED; -} - -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) -{ - if (!chip->enable) - chip->enable = default_enable; - if (!chip->disable) - chip->disable = default_disable; - if (!chip->startup) - chip->startup = default_startup; - /* - * We use chip->disable, when the user provided its own. When - * we have default_disable set for chip->disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->shutdown) - chip->shutdown = chip->disable != default_disable ? - chip->disable : default_shutdown; - if (!chip->name) - chip->name = chip->typename; - if (!chip->end) - chip->end = dummy_irq_chip.end; -} - -static inline void mask_ack_irq(struct irq_desc *desc, int irq) -{ - if (desc->chip->mask_ack) - desc->chip->mask_ack(irq); - else { - desc->chip->mask(irq); - desc->chip->ack(irq); - } -} - -/** - * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Simple interrupts are either sent from a demultiplexing interrupt - * handler or come from hardware, where no interrupt hardware control - * is necessary. - * - * Note: The caller is expected to handle the ack, clear, mask and - * unmask issues if necessary. 
- */ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) -{ - struct irqaction *action; - irqreturn_t action_ret; - const unsigned int cpu = smp_processor_id(); - - spin_lock(&desc->lock); - - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; - - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) - goto out_unlock; - - desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; -out_unlock: - spin_unlock(&desc->lock); -} - -/** - * handle_level_irq - Level type irq handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Level type interrupts are active as long as the hardware line has - * the active level. This may require to mask the interrupt and unmask - * it after the associated handler has acknowledged the device, so the - * interrupt line is back to inactive. 
- */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) -{ - unsigned int cpu = smp_processor_id(); - struct irqaction *action; - irqreturn_t action_ret; - - spin_lock(&desc->lock); - mask_ack_irq(desc, irq); - - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; - - /* - * If its disabled or no action available - * keep it masked and get out of here - */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) - goto out_unlock; - - desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; - if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) - desc->chip->unmask(irq); -out_unlock: - spin_unlock(&desc->lock); -} - -/** - * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Only a single callback will be issued to the chip: an ->eoi() - * call when the interrupt has been serviced. This enables support - * for modern forms of interrupt handlers, which handle the flow - * details in hardware, transparently. 
- */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) -{ - unsigned int cpu = smp_processor_id(); - struct irqaction *action; - irqreturn_t action_ret; - - spin_lock(&desc->lock); - - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; - - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - desc->status |= IRQ_PENDING; - if (desc->chip->mask) - desc->chip->mask(irq); - goto out; - } - - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; -out: - desc->chip->eoi(irq); - - spin_unlock(&desc->lock); -} - -/** - * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurence is latched into the irq controller hardware - * and must be acked in order to be reenabled. After the ack another - * interrupt can happen on the same source even before the first one - * is handled by the assosiacted event handler. If this happens it - * might be necessary to disable (mask) the interrupt depending on the - * controller hardware. This requires to reenable the interrupt inside - * of the loop which handles the interrupts which have arrived while - * the handler was running. If all pending interrupts are handled, the - * loop is left. 
- */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) -{ - const unsigned int cpu = smp_processor_id(); - - spin_lock(&desc->lock); - - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - - /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Mark it pending, handle - * the necessary masking and go out - */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc, irq); - goto out_unlock; - } - - kstat_cpu(cpu).irqs[irq]++; - - /* Start handling the irq */ - desc->chip->ack(irq); - - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { - desc->chip->mask(irq); - goto out_unlock; - } - - /* - * When another irq arrived while we were handling - * one, we could have masked the irq. - * Renable it, if it was not disabled in meantime. 
- */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; - } - - desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); - - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); - - desc->status &= ~IRQ_INPROGRESS; -out_unlock: - spin_unlock(&desc->lock); -} - -/** - * handle_percpu_IRQ - Per CPU local irq handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Per CPU interrupts on SMP machines without locking requirements - */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) -{ - irqreturn_t action_ret; - - kstat_this_cpu.irqs[irq]++; - - if (desc->chip->ack) - desc->chip->ack(irq); - - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - if (desc->chip->eoi) - desc->chip->eoi(irq); -} - -void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); - return; - } - - desc = irq_desc + irq; - - if (!handle) - handle = handle_bad_irq; - else if (desc->chip == &no_irq_chip) { - printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->chip = &dummy_irq_chip; - } - - spin_lock_irqsave(&desc->lock, flags); - - /* Uninstall? 
*/ - if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) - mask_ack_irq(desc, irq); - desc->status |= IRQ_DISABLED; - desc->depth = 1; - } - desc->handle_irq = handle; - desc->name = name; - - if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->chip->unmask(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); -} - -void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); -} - -void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle, const char *name) -{ - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); -} - -void __init set_irq_noprobe(unsigned int irq) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); - - return; - } - - desc = irq_desc + irq; - - spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); -} - -void __init set_irq_probe(unsigned int irq) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - - return; - } - - desc = irq_desc + irq; - - spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); -} -#include -#include -#include -#include - -/* - * Device resource management aware IRQ request/free implementation. 
- */ -struct irq_devres { - unsigned int irq; - void *dev_id; -}; - -static void devm_irq_release(struct device *dev, void *res) -{ - struct irq_devres *this = res; - - free_irq(this->irq, this->dev_id); -} - -static int devm_irq_match(struct device *dev, void *res, void *data) -{ - struct irq_devres *this = res, *match = data; - - return this->irq == match->irq && this->dev_id == match->dev_id; -} - -/** - * devm_request_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_irq(). IRQs requested with this function will be - * automatically freed on driver detach. - * - * If an IRQ allocated with this function needs to be freed - * separately, dev_free_irq() must be used. - */ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) -{ - struct irq_devres *dr; - int rc; - - dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), - GFP_KERNEL); - if (!dr) - return -ENOMEM; - - rc = request_irq(irq, handler, irqflags, devname, dev_id); - if (rc) { - devres_free(dr); - return rc; - } - - dr->irq = irq; - dr->dev_id = dev_id; - devres_add(dev, dr); - - return 0; -} -EXPORT_SYMBOL(devm_request_irq); - -/** - * devm_free_irq - free an interrupt - * @dev: device to free interrupt for - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as free_irq(). 
- * This function instead of free_irq() should be used to manually - * free IRQs allocated with dev_request_irq(). - */ -void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) -{ - struct irq_devres match_data = { irq, dev_id }; - - free_irq(irq, dev_id); - WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, - &match_data)); -} -EXPORT_SYMBOL(devm_free_irq); -/* - * linux/kernel/irq/handle.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/DocBook/genericirq - * - */ - -#include -#include -#include -#include -#include - -#include "internals.h" - -/** - * handle_bad_irq - handle spurious and unhandled irqs - * @irq: the interrupt number - * @desc: description of the interrupt - * - * Handles spurious and unhandled IRQ's. It also prints a debugmessage. - */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) -{ - print_irq_desc(irq, desc); - kstat_this_cpu.irqs[irq]++; - ack_bad_irq(irq); -} - -/* - * Linux has a controller-independent interrupt architecture. - * Every controller has a 'controller-template', that is used - * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the appropriate - * controller. Thus drivers need not be aware of the - * interrupt-controller. - * - * The code is designed to be easily extended with new/different - * interrupt controllers, without having to do assembly magic or - * having to touch the generic code. - * - * Controller mappings for all interrupt sources: - */ -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { - [0 ... 
NR_IRQS-1] = { - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif - } -}; - -/* - * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. - */ -static void ack_bad(unsigned int irq) -{ - print_irq_desc(irq, irq_desc + irq); - ack_bad_irq(irq); -} - -/* - * NOP functions - */ -static void noop(unsigned int irq) -{ -} - -static unsigned int noop_ret(unsigned int irq) -{ - return 0; -} - -/* - * Generic no controller implementation - */ -struct irq_chip no_irq_chip = { - .name = "none", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = ack_bad, - .end = noop, -}; - -/* - * Generic dummy implementation which can be used for - * real dumb interrupt sources - */ -struct irq_chip dummy_irq_chip = { - .name = "dummy", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = noop, - .mask = noop, - .unmask = noop, - .end = noop, -}; - -/* - * Special, empty irq handler: - */ -irqreturn_t no_action(int cpl, void *dev_id) -{ - return IRQ_NONE; -} - -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) -{ - irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; - - handle_dynamic_tick(action); - - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); - - do { - ret = action->handler(irq, action->dev_id); - if (ret == IRQ_HANDLED) - status |= action->flags; - retval |= ret; - action = action->next; - } while (action); - - if (status & IRQF_SAMPLE_RANDOM) - add_interrupt_randomness(irq); - local_irq_disable(); - - return retval; -} - -#ifndef 
CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - unsigned int status; - - kstat_this_cpu.irqs[irq]++; - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->chip->ack) - desc->chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->chip->end(irq); - return 1; - } - - spin_lock(&desc->lock); - if (desc->chip->ack) - desc->chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. 
- * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->chip->end(irq); - spin_unlock(&desc->lock); - - return 1; -} -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - -void early_init_irq_lock_class(void) -{ - int i; - - for (i = 0; i < NR_IRQS; i++) - lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); -} - -#endif -/* - * linux/kernel/irq/manage.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006 Thomas Gleixner - * - * This file contains driver APIs to the irq subsystem. - */ - -#include -#include -#include -#include -#include - -#include "internals.h" - -#ifdef CONFIG_SMP - -cpumask_t irq_default_affinity = CPU_MASK_ALL; - -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. 
- */ -void synchronize_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - unsigned int status; - - if (irq >= NR_IRQS) - return; - - do { - unsigned long flags; - - /* - * Wait until we're out of the critical section. This might - * give the wrong answer due to the lack of memory barriers. - */ - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); - - /* Ok, that indicated we're done: double-check carefully. */ - spin_lock_irqsave(&desc->lock, flags); - status = desc->status; - spin_unlock_irqrestore(&desc->lock, flags); - - /* Oops, that failed? */ - } while (status & IRQ_INPROGRESS); -} -EXPORT_SYMBOL(synchronize_irq); - -/** - * irq_can_set_affinity - Check if the affinity of a given irq can be set - * @irq: Interrupt to check - * - */ -int irq_can_set_affinity(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || - !desc->chip->set_affinity) - return 0; - - return 1; -} - -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, cpumask_t cpumask) -{ - struct irq_desc *desc = irq_desc + irq; - - if (!desc->chip->set_affinity) - return -EINVAL; - - set_balance_irq_affinity(irq, cpumask); - -#ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); -#else - desc->affinity = cpumask; - desc->chip->set_affinity(irq, cpumask); -#endif - return 0; -} - -#ifndef CONFIG_AUTO_IRQ_AFFINITY -/* - * Generic version of the affinity autoselector. 
- */ -int irq_select_affinity(unsigned int irq) -{ - cpumask_t mask; - - if (!irq_can_set_affinity(irq)) - return 0; - - cpus_and(mask, cpu_online_map, irq_default_affinity); - - irq_desc[irq].affinity = mask; - irq_desc[irq].chip->set_affinity(irq, mask); - - set_balance_irq_affinity(irq, mask); - return 0; -} -#endif - -#endif - -/** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. - * - * This function may be called from IRQ context. - */ -void disable_irq_nosync(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - unsigned long flags; - - if (irq >= NR_IRQS) - return; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); -} -EXPORT_SYMBOL(disable_irq_nosync); - -/** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. 
- */ -void disable_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - - if (irq >= NR_IRQS) - return; - - disable_irq_nosync(irq); - if (desc->action) - synchronize_irq(irq); -} -EXPORT_SYMBOL(disable_irq); - -static void __enable_irq(struct irq_desc *desc, unsigned int irq) -{ - switch (desc->depth) { - case 0: - WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - break; - case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; - check_irq_resend(desc, irq); - /* fall-through */ - } - default: - desc->depth--; - } -} - -/** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable - * - * Undoes the effect of one call to disable_irq(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. - * - * This function may be called from IRQ context. - */ -void enable_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - unsigned long flags; - - if (irq >= NR_IRQS) - return; - - spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq); - spin_unlock_irqrestore(&desc->lock, flags); -} -EXPORT_SYMBOL(enable_irq); - -int set_irq_wake_real(unsigned int irq, unsigned int on) -{ - struct irq_desc *desc = irq_desc + irq; - int ret = -ENXIO; - - if (desc->chip->set_wake) - ret = desc->chip->set_wake(irq, on); - - return ret; -} - -/** - * set_irq_wake - control irq power management wakeup - * @irq: interrupt to control - * @on: enable/disable power management wakeup - * - * Enable/disable power management wakeup mode, which is - * disabled by default. Enables and disables must match, - * just as they match for non-wakeup mode support. - * - * Wakeup mode lets this IRQ wake the system from sleep - * states like "suspend to RAM". 
- */ -int set_irq_wake(unsigned int irq, unsigned int on) -{ - struct irq_desc *desc = irq_desc + irq; - unsigned long flags; - int ret = 0; - - /* wakeup-capable irqs can be shared between drivers that - * don't need to have the same sleep mode behaviors. - */ - spin_lock_irqsave(&desc->lock, flags); - if (on) { - if (desc->wake_depth++ == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 0; - else - desc->status |= IRQ_WAKEUP; - } - } else { - if (desc->wake_depth == 0) { - WARN(1, "Unbalanced IRQ %d wake disable\n", irq); - } else if (--desc->wake_depth == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 1; - else - desc->status &= ~IRQ_WAKEUP; - } - } - - spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -EXPORT_SYMBOL(set_irq_wake); - -/* - * Internal function that tells the architecture code whether a - * particular irq has been exclusively allocated or is available - * for driver use. - */ -int can_request_irq(unsigned int irq, unsigned long irqflags) -{ - struct irqaction *action; - - if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) - return 0; - - action = irq_desc[irq].action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - return !action; -} - -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; -} - -static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, - unsigned long flags) -{ - int ret; - - if (!chip || !chip->set_type) { - /* - * IRQF_TRIGGER_* but the PIC does not support multiple - * flow-types? - */ - pr_warning("No set_type function for IRQ %d (%s)\n", irq, - chip ? (chip->name ? 
: "unknown") : "unknown"); - return 0; - } - - ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); - - if (ret) - pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)(flags & IRQF_TRIGGER_MASK), - irq, chip->set_type); - - return ret; -} - -/* - * Internal function to register an irqaction - typically used to - * allocate special interrupts that are part of the architecture. - */ -int setup_irq(unsigned int irq, struct irqaction *new) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *old, **p; - const char *old_name = NULL; - unsigned long flags; - int shared = 0; - int ret; - - if (irq >= NR_IRQS) - return -EINVAL; - - if (desc->chip == &no_irq_chip) - return -ENOSYS; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } - - /* - * The following block of code has to be executed atomically - */ - spin_lock_irqsave(&desc->lock, flags); - p = &desc->action; - old = *p; - if (old) { - /* - * Can't share interrupts unless both agree to and are - * the same type (level, edge, polarity). So both flag - * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. 
- */ - if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { - old_name = old->name; - goto mismatch; - } - -#if defined(CONFIG_IRQ_PER_CPU) - /* All handlers must agree on per-cpuness */ - if ((old->flags & IRQF_PERCPU) != - (new->flags & IRQF_PERCPU)) - goto mismatch; -#endif - - /* add new interrupt at end of irq queue */ - do { - p = &old->next; - old = *p; - } while (old); - shared = 1; - } - - if (!shared) { - irq_chip_set_defaults(desc->chip); - - /* Setup the type (level, edge polarity) if configured: */ - if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc->chip, irq, new->flags); - - if (ret) { - spin_unlock_irqrestore(&desc->lock, flags); - return ret; - } - } else - compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif - - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); - - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - if (desc->chip->startup) - desc->chip->startup(irq); - else - desc->chip->enable(irq); - } else - /* Undo nested disables: */ - desc->depth = 1; - - /* Set default affinity mask once everything is setup */ - irq_select_affinity(irq); - } - - *p = new; - - /* Exclude IRQ from balancing */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; - - /* Reset broken irq detection when installing new handler */ - desc->irq_count = 0; - desc->irqs_unhandled = 0; - - /* - * Check whether we disabled the irq via the spurious handler - * before. Reenable it and give it another chance. 
- */ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; - __enable_irq(desc, irq); - } - - spin_unlock_irqrestore(&desc->lock, flags); - - new->irq = irq; - register_irq_proc(irq); - new->dir = NULL; - register_handler_proc(irq, new); - - return 0; - -mismatch: -#ifdef CONFIG_DEBUG_SHIRQ - if (!(new->flags & IRQF_PROBE_SHARED)) { - printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); - if (old_name) - printk(KERN_ERR "current handler: %s\n", old_name); - dump_stack(); - } -#endif - spin_unlock_irqrestore(&desc->lock, flags); - return -EBUSY; -} - -/** - * free_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function must not be called from interrupt context. 
- */ -void free_irq(unsigned int irq, void *dev_id) -{ - struct irq_desc *desc; - struct irqaction **p; - unsigned long flags; - - WARN_ON(in_interrupt()); - if (irq >= NR_IRQS) - return; - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); - p = &desc->action; - for (;;) { - struct irqaction *action = *p; - - if (action) { - struct irqaction **pp = p; - - p = &action->next; - if (action->dev_id != dev_id) - continue; - - /* Found it - now remove it from the list of entries */ - *pp = action->next; - - /* Currently used only by UML, might disappear one day.*/ -#ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); -#endif - - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); - else - desc->chip->disable(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); - unregister_handler_proc(irq, action); - - /* Make sure it's not being used on another CPU */ - synchronize_irq(irq); -#ifdef CONFIG_DEBUG_SHIRQ - /* - * It's a shared IRQ -- the driver ought to be - * prepared for it to happen even now it's - * being freed, so let's make sure.... 
We do - * this after actually deregistering it, to - * make sure that a 'real' IRQ doesn't run in - * parallel with our fake - */ - if (action->flags & IRQF_SHARED) { - local_irq_save(flags); - action->handler(irq, dev_id); - local_irq_restore(flags); - } -#endif - kfree(action); - return; - } - printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); -#ifdef CONFIG_DEBUG_SHIRQ - dump_stack(); -#endif - spin_unlock_irqrestore(&desc->lock, flags); - return; - } -} -EXPORT_SYMBOL(free_irq); - -/** - * request_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. 
- * - * Flags: - * - * IRQF_SHARED Interrupt is shared - * IRQF_DISABLED Disable local interrupts while processing - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy - * - */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - struct irqaction *action; - int retval; - -#ifdef CONFIG_LOCKDEP - /* - * Lockdep wants atomic interrupt handlers: - */ - irqflags |= IRQF_DISABLED; -#endif - /* - * Sanity-check: shared interrupts must pass in a real dev-ID, - * otherwise we'll have trouble later trying to figure out - * which interrupt is which (messes up the interrupt freeing - * logic etc). - */ - if ((irqflags & IRQF_SHARED) && !dev_id) - return -EINVAL; - if (irq >= NR_IRQS) - return -EINVAL; - if (irq_desc[irq].status & IRQ_NOREQUEST) - return -EINVAL; - if (!handler) - return -EINVAL; - - action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->flags = irqflags; - cpus_clear(action->mask); - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - -#ifdef CONFIG_DEBUG_SHIRQ_FIXME - if (irqflags & IRQF_SHARED) { - /* - * It's a shared IRQ -- the driver ought to be prepared for it - * to happen immediately, so let's make sure.... 
- * We do this before actually registering it, to make sure that - * a 'real' IRQ doesn't run in parallel with our fake - */ - unsigned long flags; - - local_irq_save(flags); - handler(irq, dev_id); - local_irq_restore(flags); - } -#endif - - retval = setup_irq(irq, action); - if (retval) - kfree(action); - - return retval; -} -EXPORT_SYMBOL(request_irq); - -#include - -void set_pending_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; - spin_unlock_irqrestore(&desc->lock, flags); -} - -void move_masked_irq(int irq) -{ - struct irq_desc *desc = irq_desc + irq; - cpumask_t tmp; - - if (likely(!(desc->status & IRQ_MOVE_PENDING))) - return; - - /* - * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. - */ - if (CHECK_IRQ_PER_CPU(desc->status)) { - WARN_ON(1); - return; - } - - desc->status &= ~IRQ_MOVE_PENDING; - - if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) - return; - - if (!desc->chip->set_affinity) - return; - - assert_spin_locked(&desc->lock); - - cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); - - /* - * If there was a valid mask to work with, please - * do the disable, re-program, enable sequence. - * This is *not* particularly important for level triggered - * but in a edge trigger case, we might be setting rte - * when an active trigger is comming in. This could - * cause some ioapics to mal-function. - * Being paranoid i guess! - * - * For correct operation this depends on the caller - * masking the irqs. 
- */ - if (likely(!cpus_empty(tmp))) { - desc->chip->set_affinity(irq,tmp); - } - cpus_clear(irq_desc[irq].pending_mask); -} - -void move_native_irq(int irq) -{ - struct irq_desc *desc = irq_desc + irq; - - if (likely(!(desc->status & IRQ_MOVE_PENDING))) - return; - - if (unlikely(desc->status & IRQ_DISABLED)) - return; - - desc->chip->mask(irq); - move_masked_irq(irq); - desc->chip->unmask(irq); -} - -/* - * linux/kernel/irq/proc.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file contains the /proc/irq/ handling code. - */ - -#include -#include -#include -#include - -#include "internals.h" - -static struct proc_dir_entry *root_irq_dir; - -#ifdef CONFIG_SMP - -static int irq_affinity_proc_show(struct seq_file *m, void *v) -{ - struct irq_desc *desc = irq_desc + (long)m->private; - cpumask_t *mask = &desc->affinity; - -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) - mask = &desc->pending_mask; -#endif - seq_cpumask(m, mask); - seq_putc(m, '\n'); - return 0; -} - -#ifndef is_affinity_mask_valid -#define is_affinity_mask_valid(val) 1 -#endif - -int no_irq_affinity; -static ssize_t irq_affinity_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_t new_value; - int err; - - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) - return -EIO; - - err = cpumask_parse_user(buffer, count, new_value); - if (err) - return err; - - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; - - /* - * Do not allow disabling IRQs completely - it's a too easy - * way to make the system unusable accidentally :-) At least - * one online CPU still has to be targeted. - */ - if (!cpus_intersects(new_value, cpu_online_map)) - /* Special case for empty set - allow the architecture - code to set default SMP affinity. */ - return irq_select_affinity(irq) ? 
-EINVAL : count; - - irq_set_affinity(irq, new_value); - - return count; -} - -static int irq_affinity_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_proc_show, PDE(inode)->data); -} - -static const struct file_operations irq_affinity_proc_fops = { - .open = irq_affinity_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = irq_affinity_proc_write, -}; - -static int default_affinity_show(struct seq_file *m, void *v) -{ - seq_cpumask(m, &irq_default_affinity); - seq_putc(m, '\n'); - return 0; -} - -static ssize_t default_affinity_write(struct file *file, - const char __user *buffer, size_t count, loff_t *ppos) -{ - cpumask_t new_value; - int err; - - err = cpumask_parse_user(buffer, count, new_value); - if (err) - return err; - - if (!is_affinity_mask_valid(new_value)) - return -EINVAL; - - /* - * Do not allow disabling IRQs completely - it's a too easy - * way to make the system unusable accidentally :-) At least - * one online CPU still has to be targeted. 
- */ - if (!cpus_intersects(new_value, cpu_online_map)) - return -EINVAL; - - irq_default_affinity = new_value; - - return count; -} - -static int default_affinity_open(struct inode *inode, struct file *file) -{ - return single_open(file, default_affinity_show, NULL); -} - -static const struct file_operations default_affinity_proc_fops = { - .open = default_affinity_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = default_affinity_write, -}; -#endif - -static int irq_spurious_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct irq_desc *d = &irq_desc[(long) data]; - return sprintf(page, "count %u\n" - "unhandled %u\n" - "last_unhandled %u ms\n", - d->irq_count, - d->irqs_unhandled, - jiffies_to_msecs(d->last_unhandled)); -} - -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } - } - spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - - if (!irq_desc[irq].dir || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); -} - -#undef MAX_NAMELEN - -#define MAX_NAMELEN 10 - -void register_irq_proc(unsigned int irq) -{ - char name [MAX_NAMELEN]; - struct proc_dir_entry *entry; - - if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_chip) || - irq_desc[irq].dir) - return; - - memset(name, 0, MAX_NAMELEN); - 
sprintf(name, "%d", irq); - - /* create /proc/irq/1234 */ - irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); - -#ifdef CONFIG_SMP - /* create /proc/irq//smp_affinity */ - proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, - &irq_affinity_proc_fops, (void *)(long)irq); -#endif - - entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_spurious_read; - } -} - -#undef MAX_NAMELEN - -void unregister_handler_proc(unsigned int irq, struct irqaction *action) -{ - if (action->dir) - remove_proc_entry(action->dir->name, irq_desc[irq].dir); -} - -void register_default_affinity_proc(void) -{ -#ifdef CONFIG_SMP - proc_create("irq/default_smp_affinity", 0600, NULL, - &default_affinity_proc_fops); -#endif -} - -void init_irq_proc(void) -{ - int i; - - /* create /proc/irq */ - root_irq_dir = proc_mkdir("irq", NULL); - if (!root_irq_dir) - return; - - register_default_affinity_proc(); - - /* - * Create entries for all existing IRQs. - */ - for (i = 0; i < NR_IRQS; i++) - register_irq_proc(i); -} - -/* - * linux/kernel/irq/resend.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner - * - * This file contains the IRQ-resend code - * - * If the interrupt is waiting to be processed, we try to re-run it. - * We can't directly run it from here since the caller might be in an - * interrupt-protected region. Not all irq controller chips can - * retrigger interrupts at the hardware level, so in those cases - * we allow the resending of IRQs via a tasklet. 
- */ - -#include -#include -#include -#include - -#include "internals.h" - -#ifdef CONFIG_HARDIRQS_SW_RESEND - -/* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, NR_IRQS); - -/* - * Run software resends of IRQ's - */ -static void resend_irqs(unsigned long arg) -{ - struct irq_desc *desc; - int irq; - - while (!bitmap_empty(irqs_resend, NR_IRQS)) { - irq = find_first_bit(irqs_resend, NR_IRQS); - clear_bit(irq, irqs_resend); - desc = irq_desc + irq; - local_irq_disable(); - desc->handle_irq(irq, desc); - local_irq_enable(); - } -} - -/* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); - -#endif - -/* - * IRQ resend - * - * Is called with interrupts disabled and desc->lock held. - */ -void check_irq_resend(struct irq_desc *desc, unsigned int irq) -{ - unsigned int status = desc->status; - - /* - * Make sure the interrupt is enabled, before resending it: - */ - desc->chip->enable(irq); - - /* - * We do not resend level type interrupts. Level type - * interrupts are resent by hardware when they are still - * active. - */ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - - if (!desc->chip || !desc->chip->retrigger || - !desc->chip->retrigger(irq)) { -#ifdef CONFIG_HARDIRQS_SW_RESEND - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); - tasklet_schedule(&resend_tasklet); -#endif - } - } -} -/* - * linux/kernel/irq/spurious.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file contains spurious interrupt handling. - */ - -#include -#include -#include -#include -#include -#include - -static int irqfixup __read_mostly; - -/* - * Recovery handler for misrouted interrupts. 
- */ -static int misrouted_irq(int irq) -{ - int i; - int ok = 0; - int work = 0; /* Did we do work for a real IRQ */ - - for (i = 1; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - struct irqaction *action; - - if (i == irq) /* Already tried */ - continue; - - spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); - continue; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; - spin_unlock(&desc->lock); - - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(i, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - spin_lock(&desc->lock); - action = desc->action; - - /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: - */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; - spin_unlock(&desc->lock); - handle_IRQ_event(i, action); - spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too - */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(i); - spin_unlock(&desc->lock); - } - /* So the caller can adjust the irq error counts */ - return ok; -} - -/* - * If 99,900 of the previous 100,000 interrupts have not been handled - * then assume that the IRQ is stuck in some manner. Drop a diagnostic - * and try to turn the IRQ off. 
- * - * (The other 100-of-100,000 interrupts may have been a correctly - * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock - */ - -static void -__report_bad_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) -{ - struct irqaction *action; - - if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { - printk(KERN_ERR "irq event %d: bogus return value %x\n", - irq, action_ret); - } else { - printk(KERN_ERR "irq %d: nobody cared (try booting with " - "the \"irqpoll\" option)\n", irq); - } - dump_stack(); - printk(KERN_ERR "handlers:\n"); - - action = desc->action; - while (action) { - printk(KERN_ERR "[<%p>]", action->handler); - print_symbol(" (%s)", - (unsigned long)action->handler); - printk("\n"); - action = action->next; - } -} - -static void -report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) -{ - static int count = 100; - - if (count > 0) { - count--; - __report_bad_irq(irq, desc, action_ret); - } -} - -static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) -{ - struct irqaction *action; - - if (!irqfixup) - return 0; - - /* We didn't actually handle the IRQ - see if it was misrouted? */ - if (action_ret == IRQ_NONE) - return 1; - - /* - * But for 'irqfixup == 2' we also do it for handled interrupts if - * they are marked as IRQF_IRQPOLL (or for irq zero, which is the - * traditional PC timer interrupt.. Legacy) - */ - if (irqfixup < 2) - return 0; - - if (!irq) - return 1; - - /* - * Since we don't get the descriptor lock, "action" can - * change under us. We don't really care, but we don't - * want to follow a NULL pointer. So tell the compiler to - * just load it once by using a barrier. 
- */ - action = desc->action; - barrier(); - return action && (action->flags & IRQF_IRQPOLL); -} - -void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) -{ - if (unlikely(action_ret != IRQ_HANDLED)) { - /* - * If we are seeing only the odd spurious IRQ caused by - * bus asynchronicity then don't eventually trigger an error, - * otherwise the couter becomes a doomsday timer for otherwise - * working systems - */ - if (time_after(jiffies, desc->last_unhandled + HZ/10)) - desc->irqs_unhandled = 1; - else - desc->irqs_unhandled++; - desc->last_unhandled = jiffies; - if (unlikely(action_ret != IRQ_NONE)) - report_bad_irq(irq, desc, action_ret); - } - - if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { - int ok = misrouted_irq(irq); - if (action_ret == IRQ_NONE) - desc->irqs_unhandled -= ok; - } - - desc->irq_count++; - if (likely(desc->irq_count < 100000)) - return; - - desc->irq_count = 0; - if (unlikely(desc->irqs_unhandled > 99900)) { - /* - * The interrupt is stuck - */ - __report_bad_irq(irq, desc, action_ret); - /* - * Now kill the IRQ - */ - printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; - desc->depth++; - desc->chip->disable(irq); - } - desc->irqs_unhandled = 0; -} - -int noirqdebug __read_mostly; - -int noirqdebug_setup(char *str) -{ - noirqdebug = 1; - printk(KERN_INFO "IRQ lockup detection disabled\n"); - - return 1; -} - -__setup("noirqdebug", noirqdebug_setup); -module_param(noirqdebug, bool, 0644); -MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); - -static int __init irqfixup_setup(char *str) -{ - irqfixup = 1; - printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); - printk(KERN_WARNING "This may impact system performance.\n"); - - return 1; -} - -__setup("irqfixup", irqfixup_setup); -module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); - -static int __init 
irqpoll_setup(char *str) -{ - irqfixup = 2; - printk(KERN_WARNING "Misrouted IRQ fixup and polling support " - "enabled\n"); - printk(KERN_WARNING "This may significantly impact system " - "performance\n"); - return 1; -} - -__setup("irqpoll", irqpoll_setup); -/* - * linux/kernel/itimer.c - * - * Copyright (C) 1992 Darren Senn - */ - -/* These are all the functions necessary to implement itimers */ - -#include -#include -#include -#include -#include -#include - -#include - -/** - * itimer_get_remtime - get remaining time for the timer - * - * @timer: the timer to read - * - * Returns the delta between the expiry time and now, which can be - * less than zero or 1usec for an pending expired timer - */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) -{ - ktime_t rem = hrtimer_get_remaining(timer); - - /* - * Racy but safe: if the itimer expires after the above - * hrtimer_get_remtime() call but before this condition - * then we return 0 - which is correct. - */ - if (hrtimer_active(timer)) { - if (rem.tv64 <= 0) - rem.tv64 = NSEC_PER_USEC; - } else - rem.tv64 = 0; - - return ktime_to_timeval(rem); -} - -int do_getitimer(int which, struct itimerval *value) -{ - struct task_struct *tsk = current; - cputime_t cinterval, cval; - - switch (which) { - case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - value->it_value = itimer_get_remtime(&tsk->signal->real_timer); - value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); - spin_unlock_irq(&tsk->sighand->siglock); - break; - case ITIMER_VIRTUAL: - read_lock(&tasklist_lock); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t utime = tsk->signal->utime; - do { - utime = cputime_add(utime, t->utime); - t = next_thread(t); - } while (t != tsk); - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { 
- cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); - break; - case ITIMER_PROF: - read_lock(&tasklist_lock); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t ptime = cputime_add(tsk->signal->utime, - tsk->signal->stime); - do { - ptime = cputime_add(ptime, - cputime_add(t->utime, - t->stime)); - t = next_thread(t); - } while (t != tsk); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); - break; - default: - return(-EINVAL); - } - return 0; -} - -SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) -{ - int error = -EFAULT; - struct itimerval get_buffer; - - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } - return error; -} - - -/* - * The timer is automagically restarted, when interval != 0 - */ -enum hrtimer_restart it_real_fn(struct hrtimer *timer) -{ - struct signal_struct *sig = - container_of(timer, struct signal_struct, real_timer); - - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); - - return HRTIMER_NORESTART; -} - -/* - * Returns true if the timeval is in canonical form - */ -#define timeval_valid(t) \ - (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) - -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) -{ - struct task_struct *tsk = current; - struct hrtimer *timer; - ktime_t expires; - cputime_t 
cval, cinterval, nval, ninterval; - - /* - * Validate the timevals in value. - */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - - switch (which) { - case ITIMER_REAL: -again: - spin_lock_irq(&tsk->sighand->siglock); - timer = &tsk->signal->real_timer; - if (ovalue) { - ovalue->it_value = itimer_get_remtime(timer); - ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); - } - /* We are sharing ->siglock with it_real_fn() */ - if (hrtimer_try_to_cancel(timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } - expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) { - tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); - hrtimer_start(timer, expires, HRTIMER_MODE_REL); - } else - tsk->signal->it_real_incr.tv64 = 0; - - spin_unlock_irq(&tsk->sighand->siglock); - break; - case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } - break; - case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) 
|| - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } - break; - default: - return -EINVAL; - } - return 0; -} - -/** - * alarm_setitimer - set alarm in seconds - * - * @seconds: number of seconds until alarm - * 0 disables the alarm - * - * Returns the remaining time in seconds of a pending timer or 0 when - * the timer is not active. - * - * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid - * negative timeval settings which would cause immediate expiry. - */ -unsigned int alarm_setitimer(unsigned int seconds) -{ - struct itimerval it_new, it_old; - -#if BITS_PER_LONG < 64 - if (seconds > INT_MAX) - seconds = INT_MAX; -#endif - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - - do_setitimer(ITIMER_REAL, &it_new, &it_old); - - /* - * We can't return 0 if we have an alarm pending ... And we'd - * better return too much than too little anyway - */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) - it_old.it_value.tv_sec++; - - return it_old.it_value.tv_sec; -} - -SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, - struct itimerval __user *, ovalue) -{ - struct itimerval set_buffer, get_buffer; - int error; - - if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; - } else - memset((char *) &set_buffer, 0, sizeof(set_buffer)); - - error = do_setitimer(which, &set_buffer, ovalue ? 
&get_buffer : NULL); - if (error || !ovalue) - return error; - - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) - return -EFAULT; - return 0; -} -/* - * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. - * - * Rewritten and vastly simplified by Rusty Russell for in-kernel - * module loader: - * Copyright 2002 Rusty Russell IBM Corporation - * - * ChangeLog: - * - * (25/Aug/2004) Paulo Marques - * Changed the compression method from stem compression to "table lookup" - * compression (see scripts/kallsyms.c for a more complete description) - */ -#include -#include -#include -#include -#include -#include -#include -#include /* for cond_resched */ -#include -#include - -#include - -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - -/* These will be re-linked against their real values during the second link stage */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); - -/* tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV) - */ -extern const unsigned long kallsyms_num_syms -__attribute__((weak, section(".rodata"))); - -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); - -extern const unsigned long kallsyms_markers[] __attribute__((weak)); - -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) - return 1; - return in_gate_area_no_task(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_task(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if 
(all_var) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - -/* expand a compressed symbol data into the resulting uncompressed string, - given the offset to where the symbol is in the compressed stream */ -static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) -{ - int len, skipped_first = 0; - const u8 *tptr, *data; - - /* get the compressed symbol length from the first symbol byte */ - data = &kallsyms_names[off]; - len = *data; - data++; - - /* update the offset to return the offset for the next symbol on - * the compressed stream */ - off += len + 1; - - /* for every byte on the compressed symbol data, copy the table - entry for that byte */ - while(len) { - tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; - data++; - len--; - - while (*tptr) { - if(skipped_first) { - *result = *tptr; - result++; - } else - skipped_first = 1; - tptr++; - } - } - - *result = '\0'; - - /* return to offset to the next symbol */ - return off; -} - -/* get symbol type information. This is encoded as a single char at the - * begining of the symbol name */ -static char kallsyms_get_symbol_type(unsigned int off) -{ - /* get just the first code, look it up in the token table, and return the - * first char from this token */ - return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; -} - - -/* find the offset on the compressed stream given and index in the - * kallsyms array */ -static unsigned int get_symbol_offset(unsigned long pos) -{ - const u8 *name; - int i; - - /* use the closest marker we have. We have markers every 256 positions, - * so that should be close enough */ - name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; - - /* sequentially scan all the symbols up to the point we're searching for. 
- * Every symbol is stored in a [][ bytes of data] format, so we - * just need to add the len to the current pointer for every symbol we - * wish to skip */ - for(i = 0; i < (pos&0xFF); i++) - name = name + (*name) + 1; - - return name - kallsyms_names; -} - -/* Lookup the address for this symbol. Returns 0 if not found. */ -unsigned long kallsyms_lookup_name(const char *name) -{ - char namebuf[KSYM_NAME_LEN]; - unsigned long i; - unsigned int off; - - for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); - - if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; - } - return module_kallsyms_lookup_name(name); -} - -static unsigned long get_symbol_pos(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset) -{ - unsigned long symbol_start = 0, symbol_end = 0; - unsigned long i, low, high, mid; - - /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); - - /* do a binary search on the sorted kallsyms_addresses array */ - low = 0; - high = kallsyms_num_syms; - - while (high - low > 1) { - mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) - low = mid; - else - high = mid; - } - - /* - * search for the first aliased symbol. 
Aliased - * symbols are symbols with the same address - */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) - --low; - - symbol_start = kallsyms_addresses[low]; - - /* Search for next non-aliased symbol */ - for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; - break; - } - } - - /* if we found no next symbol, we use the end of the section */ - if (!symbol_end) { - if (is_kernel_inittext(addr)) - symbol_end = (unsigned long)_einittext; - else if (all_var) - symbol_end = (unsigned long)_end; - else - symbol_end = (unsigned long)_etext; - } - - if (symbolsize) - *symbolsize = symbol_end - symbol_start; - if (offset) - *offset = addr - symbol_start; - - return low; -} - -/* - * Lookup an address but don't bother to find any names. - */ -int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, - unsigned long *offset) -{ - char namebuf[KSYM_NAME_LEN]; - if (is_ksym_addr(addr)) - return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); -} - -/* - * Lookup an address - * - modname is set to NULL if it's in the kernel - * - we guarantee that the returned name is valid until we reschedule even if - * it resides in a module - * - we also guarantee that modname will be valid until rescheduled - */ -const char *kallsyms_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, char *namebuf) -{ - namebuf[KSYM_NAME_LEN - 1] = 0; - namebuf[0] = 0; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, symbolsize, offset); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); - if (modname) - *modname = NULL; - return namebuf; - } - - /* see if it's in a module */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); - return NULL; -} - -int lookup_symbol_name(unsigned long 
addr, char *symname) -{ - symname[0] = '\0'; - symname[KSYM_NAME_LEN - 1] = '\0'; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, NULL, NULL); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), symname); - return 0; - } - /* see if it's in a module */ - return lookup_module_symbol_name(addr, symname); -} - -int lookup_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - name[0] = '\0'; - name[KSYM_NAME_LEN - 1] = '\0'; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, size, offset); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), name); - modname[0] = '\0'; - return 0; - } - /* see if it's in a module */ - return lookup_module_symbol_attrs(addr, size, offset, modname, name); -} - -/* Look up a kernel symbol and return it in a text buffer. */ -int sprint_symbol(char *buffer, unsigned long address) -{ - char *modname; - const char *name; - unsigned long offset, size; - char namebuf[KSYM_NAME_LEN]; - - name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); - if (!name) - return sprintf(buffer, "0x%lx", address); - - if (modname) - return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, - size, modname); - else - return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); -} - -/* Look up a kernel symbol and print it to the kernel messages. */ -void __print_symbol(const char *fmt, unsigned long address) -{ - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - - printk(fmt, buffer); -} - -/* To avoid using get_symbol_offset for every symbol, we carry prefix along. 
*/ -struct kallsym_iter -{ - loff_t pos; - unsigned long value; - unsigned int nameoff; /* If iterating in core kernel symbols */ - char type; - char name[KSYM_NAME_LEN]; - char module_name[MODULE_NAME_LEN]; - int exported; -}; - -static int get_ksymbol_mod(struct kallsym_iter *iter) -{ - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) - return 0; - return 1; -} - -/* Returns space to next name. */ -static unsigned long get_ksymbol_core(struct kallsym_iter *iter) -{ - unsigned off = iter->nameoff; - - iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; - - iter->type = kallsyms_get_symbol_type(off); - - off = kallsyms_expand_symbol(off, iter->name); - - return off - iter->nameoff; -} - -static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) -{ - iter->name[0] = '\0'; - iter->nameoff = get_symbol_offset(new_pos); - iter->pos = new_pos; -} - -/* Returns false if pos at or past end of file. */ -static int update_iter(struct kallsym_iter *iter, loff_t pos) -{ - /* Module symbols can be accessed randomly. */ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } - - /* If we're not on the desired position, reset to new position. */ - if (pos != iter->pos) - reset_iter(iter, pos); - - iter->nameoff += get_ksymbol_core(iter); - iter->pos++; - - return 1; -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - (*pos)++; - - if (!update_iter(m->private, *pos)) - return NULL; - return p; -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - if (!update_iter(m->private, *pos)) - return NULL; - return m->private; -} - -static void s_stop(struct seq_file *m, void *p) -{ -} - -static int s_show(struct seq_file *m, void *p) -{ - struct kallsym_iter *iter = m->private; - - /* Some debugging symbols have no name. Ignore them. 
*/ - if (!iter->name[0]) - return 0; - - if (iter->module_name[0]) { - char type; - - /* Label it "global" if it is exported, - * "local" if not exported. */ - type = iter->exported ? toupper(iter->type) : - tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2*sizeof(void*)), - iter->value, type, iter->name, iter->module_name); - } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2*sizeof(void*)), - iter->value, iter->type, iter->name); - return 0; -} - -static const struct seq_operations kallsyms_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show -}; - -static int kallsyms_open(struct inode *inode, struct file *file) -{ - /* We keep iterator in m->private, since normal case is to - * s_start from where we left off, so we avoid doing - * using get_symbol_offset for every symbol */ - struct kallsym_iter *iter; - int ret; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - reset_iter(iter, 0); - - ret = seq_open(file, &kallsyms_op); - if (ret == 0) - ((struct seq_file *)file->private_data)->private = iter; - else - kfree(iter); - return ret; -} - -static const struct file_operations kallsyms_operations = { - .open = kallsyms_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static int __init kallsyms_init(void) -{ - proc_create("kallsyms", 0444, NULL, &kallsyms_operations); - return 0; -} -__initcall(kallsyms_init); - -EXPORT_SYMBOL(__print_symbol); -EXPORT_SYMBOL_GPL(sprint_symbol); -/* - * kexec.c - kexec system call - * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t* crash_notes; - -/* vmcoreinfo stuff */ -unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -int kexec_should_crash(struct task_struct *p) -{ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. - * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in . - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. 
- * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. 
- */ -#define KIMAGE_NO_DEST (-1UL) - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long dest); - -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unuseable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) - goto out; - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - * - * Since the kernel does everything in page size chunks ensure - * the destination addreses are page aligned. Too many - * special cases crop of when we don't do this. The most - * insidious is getting overlapping destination addresses - * simply because addresses are changed to page size - * granularity. 
- */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; - } - - /* Verify our destination addresses do not overlap. - * If we alloed overlapping destination addresses - * through very weird things can happen with no - * easy explanation as one segment stops on another. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - unsigned long j; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - for (j = 0; j < i; j++) { - unsigned long pstart, pend; - pstart = image->segment[j].mem; - pend = pstart + image->segment[j].memsz; - /* Do the segments overlap ? */ - if ((mend > pstart) && (mstart < pend)) - goto out; - } - } - - /* Ensure our buffer sizes are strictly less than - * our memory sizes. This should always be the case, - * and it is easier to check up front than to be surprised - * later on. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; - } - - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); - - return result; - -} - -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - - /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; - - *rimage = image; - - /* - * Find a location for the control code buffer, and add it - * the vector of segments so that it's pages will also be - * counted as destination pages. 
- */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; - } - - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - printk(KERN_ERR "Could not allocate swap buffer\n"); - goto out; - } - - result = 0; - out: - if (result == 0) - *rimage = image; - else - kfree(image); - - return result; -} - -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - unsigned long i; - - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; - goto out; - } - - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; - - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. 
- */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out; - } - - /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. - */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; - } - - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); - - return result; -} - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, - unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) - return 1; - } - - return 0; -} - -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *pages; - - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - set_page_private(pages, order); - count = 1 << order; - for (i = 0; i < count; i++) - SetPageReserved(pages + i); - } - - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - - order = page_private(page); - count = 1 << order; - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); -} - -static void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - 
kimage_free_pages(page); - } -} - -static struct page *kimage_alloc_normal_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. - */ - do { - unsigned long pfn, epfn, addr, eaddr; - - pages = kimage_alloc_pages(GFP_KERNEL, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while (!pages); - - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. - * - * For now it is simpler to just free the pages. 
- */ - kimage_free_page_list(&extra_pages); - - return pages; -} - -static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * Control pages are also the only pags we must allocate - * when loading a crash kernel. All of the other pages - * are specified by the segments and we just memcpy - * into them directly. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * Given the low demand this implements a very simple - * allocator that finds the first hole of the appropriate - * size in the reserved memory region, and allocates all - * of the memory up to and including the hole. - */ - unsigned long hole_start, hole_end, size; - struct page *pages; - - pages = NULL; - size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { - unsigned long i; - - if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) - break; - if (hole_end > crashk_res.end) - break; - /* See if I overlap any of the segments */ - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - if ((hole_end >= mstart) && (hole_start <= mend)) { - /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - break; - } - } - /* If I don't overlap any segments I have found my hole! 
*/ - if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); - break; - } - } - if (pages) - image->control_page = hole_end; - - return pages; -} - - -struct page *kimage_alloc_control_pages(struct kimage *image, - unsigned int order) -{ - struct page *pages = NULL; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - pages = kimage_alloc_normal_control_pages(image, order); - break; - case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); - break; - } - - return pages; -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) - image->entry++; - - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) - return -ENOMEM; - - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = ind_page + - ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - - return 0; -} - -static int kimage_set_destination(struct kimage *image, - unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) - image->destination = destination; - - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) - image->destination += PAGE_SIZE; - - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unuseable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); - -} -static void kimage_terminate(struct kimage 
*image) -{ - if (*image->entry != 0) - image->entry++; - - *image->entry = IND_DONE; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -static void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } - else if (entry & IND_SOURCE) - kimage_free_entry(entry); - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, - unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) - destination = entry & PAGE_MASK; - else if (entry & IND_SOURCE) { - if (page == destination) - return ptr; - destination += PAGE_SIZE; - } - } - - return NULL; -} - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. 
- * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. - * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) - return NULL; - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > - (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. 
- */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it if it's - * gfp_flags honor the ones passed in. - */ - if (!(gfp_mask & __GFP_HIGHMEM) && - PageHighMem(old_page)) { - kimage_free_pages(old_page); - continue; - } - addr = old_addr; - page = old_page; - break; - } - else { - /* Place the page on the destination list I - * will use it later. - */ - list_add(&page->lru, &image->dest_pages); - } - } - - return page; -} - -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long maddr; - unsigned long ubytes, mbytes; - int result; - unsigned char __user *buf; - - result = 0; - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - - result = kimage_set_destination(image, maddr); - if (result < 0) - goto out; - - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (!page) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) - << PAGE_SHIFT); - if (result < 0) - goto out; - - ptr = kmap(page); - /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); - ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) - uchunk = ubytes; - - result = copy_from_user(ptr, buf, uchunk); - kunmap(page); - if (result) { - result = (result < 0) ? 
result : -EIO; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) -{ - /* For crash dumps kernels we simply copy the data from - * user space to it's destination. - * We do things a page at a time for the sake of kmap. - */ - unsigned long maddr; - unsigned long ubytes, mbytes; - int result; - unsigned char __user *buf; - - result = 0; - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = pfn_to_page(maddr >> PAGE_SHIFT); - if (!page) { - result = -ENOMEM; - goto out; - } - ptr = kmap(page); - ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) { - uchunk = ubytes; - /* Zero the trailing part of the page */ - memset(ptr + uchunk, 0, mchunk - uchunk); - } - result = copy_from_user(ptr, buf, uchunk); - kexec_flush_icache_page(page); - kunmap(page); - if (result) { - result = (result < 0) ? result : -EIO; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - int result = -ENOMEM; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); - break; - case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); - break; - } - - return result; -} - -/* - * Exec Kernel system call: for obvious reasons only root may call it. - * - * This call breaks up into three pieces. - * - A generic part which loads the new kernel from the current - * address space, and very carefully places the data in the - * allocated pages. 
- * - * - A generic part that interacts with the kernel and tells all of - * the devices to shut down. Preventing on-going dmas, and placing - * the devices in a consistent state so a later kernel can - * reinitialize them. - * - * - A machine specific part that includes the syscall number - * and the copies the image to it's final destination. And - * jumps into the image at entry. - * - * kexec does not sync, or unmount filesystems so if you need - * that to happen you need to do that yourself. - */ -struct kimage *kexec_image; -struct kimage *kexec_crash_image; - -static DEFINE_MUTEX(kexec_mutex); - -SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, - struct kexec_segment __user *, segments, unsigned long, flags) -{ - struct kimage **dest_image, *image; - int result; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - /* - * Verify we have a legal set of flags - * This leaves us room for future extensions. - */ - if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) - return -EINVAL; - - /* Verify we are on the appropriate architecture */ - if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && - ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) - return -EINVAL; - - /* Put an artificial cap on the number - * of segments passed to kexec_load. - */ - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - - image = NULL; - result = 0; - - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. 
- */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) - dest_image = &kexec_crash_image; - if (nr_segments > 0) { - unsigned long i; - - /* Loading another kernel to reboot into */ - if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); - /* Loading another kernel to switch to if this one crashes */ - else if (flags & KEXEC_ON_CRASH) { - /* Free any current crash dump kernel before - * we corrupt it. - */ - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); - } - if (result) - goto out; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto out; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto out; - } - kimage_terminate(image); - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); - -out: - mutex_unlock(&kexec_mutex); - kimage_free(image); - - return result; -} - -#ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) -{ - struct compat_kexec_segment in; - struct kexec_segment out, __user *ksegments; - unsigned long i, result; - - /* Don't allow clients that don't understand the native - * architecture to do anything. 
- */ - if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) - return -EINVAL; - - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); - for (i=0; i < nr_segments; i++) { - result = copy_from_user(&in, &segments[i], sizeof(in)); - if (result) - return -EFAULT; - - out.buf = compat_ptr(in.buf); - out.bufsz = in.bufsz; - out.mem = in.mem; - out.memsz = in.memsz; - - result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) - return -EFAULT; - } - - return sys_kexec_load(entry, nr_segments, ksegments, flags); -} -#endif - -void crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_mutex here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (mutex_trylock(&kexec_mutex)) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - mutex_unlock(&kexec_mutex); - } -} - -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= NR_CPUS)) - return; 
- - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. */ - crash_notes = alloc_percpu(note_buf_t); - if (!crash_notes) { - printk("Kexec: Memory allocation for saving cpu register" - " states failed\n"); - return -ENOMEM; - } - return 0; -} -module_init(crash_notes_memory_init) - - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warning("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("crashkernel: Memory " - "value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warning("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warning("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warning("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("Memory value expected " - "after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warning("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - - return 0; -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *p = cmdline, *ck_cmdline = NULL; - char *first_colon, *first_space; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); - while (p) { - ck_cmdline = p; - p = strstr(p+1, "crashkernel="); - } - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += 12; /* strlen("crashkernel=") */ - - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - - return 0; -} - - - -void crash_save_vmcoreinfo(void) -{ - u32 *buf; - - if (!vmcoreinfo_size) - return; - - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); - - buf = (u32 *)vmcoreinfo_note; - - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - - final_note(buf); -} - -void vmcoreinfo_append_str(const char *fmt, ...) 
-{ - va_list args; - char buf[0x50]; - int r; - - va_start(args, fmt); - r = vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - if (r + vmcoreinfo_size > vmcoreinfo_max_size) - r = vmcoreinfo_max_size - vmcoreinfo_size; - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) -{} - -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); - VMCOREINFO_SYMBOL(swapper_pg_dir); - VMCOREINFO_SYMBOL(_stext); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, 
spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - - arch_crash_save_vmcoreinfo(); - - return 0; -} - -module_init(crash_save_vmcoreinfo_init) - -/* - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -int kernel_kexec(void) -{ - int error = 0; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - if (!kexec_image) { - error = -EINVAL; - goto Unlock; - } - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - mutex_lock(&pm_mutex); - pm_prepare_console(); - error = freeze_processes(); - if (error) { - error = -EBUSY; - goto Restore_console; - } - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) - goto Resume_console; - error = disable_nonboot_cpus(); - if (error) - goto Resume_devices; - device_pm_lock(); - local_irq_disable(); - /* At this point, device_suspend() has been called, - * but *not* device_power_down(). We *must* - * device_power_down() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. 
- */ - error = device_power_down(PMSG_FREEZE); - if (error) - goto Enable_irqs; - } else -#endif - { - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - } - - machine_kexec(kexec_image); - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - device_power_up(PMSG_RESTORE); - Enable_irqs: - local_irq_enable(); - device_pm_unlock(); - enable_nonboot_cpus(); - Resume_devices: - device_resume(PMSG_RESTORE); - Resume_console: - resume_console(); - thaw_processes(); - Restore_console: - pm_restore_console(); - mutex_unlock(&pm_mutex); - } -#endif - - Unlock: - mutex_unlock(&kexec_mutex); - return error; -} -/* - * A simple kernel FIFO implementation. - * - * Copyright (C) 2004 Stelian Pop - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include -#include -#include -#include -#include -#include - -/** - * kfifo_init - allocates a new FIFO using a preallocated buffer - * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer - * - * Do NOT pass the kfifo to kfifo_free() after use! Simply free the - * &struct kfifo with kfree(). 
- */ -struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock) -{ - struct kfifo *fifo; - - /* size must be a power of 2 */ - BUG_ON(!is_power_of_2(size)); - - fifo = kmalloc(sizeof(struct kfifo), gfp_mask); - if (!fifo) - return ERR_PTR(-ENOMEM); - - fifo->buffer = buffer; - fifo->size = size; - fifo->in = fifo->out = 0; - fifo->lock = lock; - - return fifo; -} -EXPORT_SYMBOL(kfifo_init); - -/** - * kfifo_alloc - allocates a new FIFO and its internal buffer - * @size: the size of the internal buffer to be allocated. - * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer - * - * The size will be rounded-up to a power of 2. - */ -struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) -{ - unsigned char *buffer; - struct kfifo *ret; - - /* - * round up to the next power of 2, since our 'let the indices - * wrap' tachnique works only in this case. - */ - if (size & (size - 1)) { - BUG_ON(size > 0x80000000); - size = roundup_pow_of_two(size); - } - - buffer = kmalloc(size, gfp_mask); - if (!buffer) - return ERR_PTR(-ENOMEM); - - ret = kfifo_init(buffer, size, gfp_mask, lock); - - if (IS_ERR(ret)) - kfree(buffer); - - return ret; -} -EXPORT_SYMBOL(kfifo_alloc); - -/** - * kfifo_free - frees the FIFO - * @fifo: the fifo to be freed. - */ -void kfifo_free(struct kfifo *fifo) -{ - kfree(fifo->buffer); - kfree(fifo); -} -EXPORT_SYMBOL(kfifo_free); - -/** - * __kfifo_put - puts some data into the FIFO, no locking version - * @fifo: the fifo to be used. - * @buffer: the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. 
- */ -unsigned int __kfifo_put(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) -{ - unsigned int l; - - len = min(len, fifo->size - fifo->in + fifo->out); - - /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. - */ - - smp_mb(); - - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); - - /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, buffer + l, len - l); - - /* - * Ensure that we add the bytes to the kfifo -before- - * we update the fifo->in index. - */ - - smp_wmb(); - - fifo->in += len; - - return len; -} -EXPORT_SYMBOL(__kfifo_put); - -/** - * __kfifo_get - gets some data from the FIFO, no locking version - * @fifo: the fifo to be used. - * @buffer: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int __kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) -{ - unsigned int l; - - len = min(len, fifo->in - fifo->out); - - /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. - */ - - smp_rmb(); - - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); - - /* then get the rest (if any) from the beginning of the buffer */ - memcpy(buffer + l, fifo->buffer, len - l); - - /* - * Ensure that we remove the bytes from the kfifo -before- - * we update the fifo->out index. 
- */ - - smp_mb(); - - fifo->out += len; - - return len; -} -EXPORT_SYMBOL(__kfifo_get); -/* - * KGDB stub. - * - * Maintainer: Jason Wessel - * - * Copyright (C) 2000-2001 VERITAS Software Corporation. - * Copyright (C) 2002-2004 Timesys Corporation - * Copyright (C) 2003-2004 Amit S. Kale - * Copyright (C) 2004 Pavel Machek - * Copyright (C) 2004-2006 Tom Rini - * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2008 Wind River Systems, Inc. - * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar - * - * Contributors at various stages not listed above: - * Jason Wessel ( jason.wessel@windriver.com ) - * George Anzinger - * Anurekh Saxena (anurekh.saxena@timesys.com) - * Lake Stevens Instrument Division (Glenn Engel) - * Jim Kingdon, Cygnus Support. - * - * Original KGDB stub: David Grothe , - * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -static int kgdb_break_asap; - -#define KGDB_MAX_THREAD_QUERY 17 -struct kgdb_state { - int ex_vector; - int signo; - int err_code; - int cpu; - int pass_exception; - unsigned long thr_query; - unsigned long threadid; - long kgdb_usethreadid; - struct pt_regs *linux_regs; -}; - -static struct debuggerinfo_struct { - void *debuggerinfo; - struct task_struct *task; -} kgdb_info[NR_CPUS]; - -/** - * kgdb_connected - Is a host GDB connected to us? 
- */ -int kgdb_connected; -EXPORT_SYMBOL_GPL(kgdb_connected); - -/* All the KGDB handlers are installed */ -static int kgdb_io_module_registered; - -/* Guard for recursive entry */ -static int exception_level; - -static struct kgdb_io *kgdb_io_ops; -static DEFINE_SPINLOCK(kgdb_registration_lock); - -/* kgdb console driver is loaded */ -static int kgdb_con_registered; -/* determine if kgdb console output should be used */ -static int kgdb_use_con; - -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - -module_param(kgdb_use_con, int, 0644); - -/* - * Holds information about breakpoints in a kernel. These breakpoints are - * added and removed by gdb. - */ -static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { - [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } -}; - -/* - * The CPU# of the active CPU, or -1 if none: - */ -atomic_t kgdb_active = ATOMIC_INIT(-1); - -/* - * We use NR_CPUs not PERCPU, in case kgdb is used to debug early - * bootup code (which might not have percpu set up yet): - */ -static atomic_t passive_cpu_wait[NR_CPUS]; -static atomic_t cpu_in_kgdb[NR_CPUS]; -atomic_t kgdb_setting_breakpoint; - -struct task_struct *kgdb_usethread; -struct task_struct *kgdb_contthread; - -int kgdb_single_step; - -/* Our I/O buffers. */ -static char remcom_in_buffer[BUFMAX]; -static char remcom_out_buffer[BUFMAX]; - -/* Storage for the registers, in GDB format. */ -static unsigned long gdb_regs[(NUMREGBYTES + - sizeof(unsigned long) - 1) / - sizeof(unsigned long)]; - -/* to keep track of the CPU which is doing the single stepping*/ -atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); - -/* - * If you are debugging a problem where roundup (the collection of - * all other CPUs) is a problem [this should be extremely rare], - * then use the nokgdbroundup option to avoid roundup. 
In that case - * the other CPUs might interfere with your debugging context, so - * use this with care: - */ -static int kgdb_do_roundup = 1; - -static int __init opt_nokgdbroundup(char *str) -{ - kgdb_do_roundup = 0; - - return 0; -} - -early_param("nokgdbroundup", opt_nokgdbroundup); - -/* - * Finally, some KGDB code :-) - */ - -/* - * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: - */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) -{ - int err; - - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); - if (err) - return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); -} - -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) -{ - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); -} - -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - int err; - /* Validate setting the breakpoint and then removing it. In the - * remove fails, the kernel needs to emit a bad message because we - * are deep trouble not being able to put things back the way we - * found them. - */ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); - if (err) - return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); - if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); - return err; -} - -unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - -int __weak kgdb_arch_init(void) -{ - return 0; -} - -int __weak kgdb_skipexception(int exception, struct pt_regs *regs) -{ - return 0; -} - -void __weak -kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - return; -} - -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. 
- * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - -/* - * GDB remote protocol parser: - */ - -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - -/* scan for the sequence $# */ -static void get_packet(char *buffer) -{ - unsigned char checksum; - unsigned char xmitcsum; - int count; - char ch; - - do { - /* - * Spin and wait around for the start character, ignore all - * other characters: - */ - while ((ch = (kgdb_io_ops->read_char())) != '$') - /* nothing */; - - kgdb_connected = 1; - checksum = 0; - xmitcsum = -1; - - count = 0; - - /* - * now, read until a # or end of buffer is found: - */ - while (count < (BUFMAX - 1)) { - ch = kgdb_io_ops->read_char(); - if (ch == '#') - break; - checksum = checksum + ch; - buffer[count] = ch; - count = count + 1; - } - buffer[count] = 0; - - if (ch == '#') { - xmitcsum = hex(kgdb_io_ops->read_char()) << 4; - xmitcsum += hex(kgdb_io_ops->read_char()); - - if (checksum != xmitcsum) - /* failed checksum */ - kgdb_io_ops->write_char('-'); - else - /* successful transfer */ - kgdb_io_ops->write_char('+'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - } - } while (checksum != xmitcsum); -} - -/* - * Send the packet in buffer. - * Check for gdb connection if asked for. - */ -static void put_packet(char *buffer) -{ - unsigned char checksum; - int count; - char ch; - - /* - * $#. 
- */ - while (1) { - kgdb_io_ops->write_char('$'); - checksum = 0; - count = 0; - - while ((ch = buffer[count])) { - kgdb_io_ops->write_char(ch); - checksum += ch; - count++; - } - - kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hex_asc_hi(checksum)); - kgdb_io_ops->write_char(hex_asc_lo(checksum)); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - - /* Now see what we get in reply. */ - ch = kgdb_io_ops->read_char(); - - if (ch == 3) - ch = kgdb_io_ops->read_char(); - - /* If we get an ACK, we are done. */ - if (ch == '+') - return; - - /* - * If we get the start of another packet, this means - * that GDB is attempting to reconnect. We will NAK - * the packet being sent, and stop trying to send this - * packet. - */ - if (ch == '$') { - kgdb_io_ops->write_char('-'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - return; - } - } -} - -/* - * Convert the memory pointed to by mem into hex, placing result in buf. - * Return a pointer to the last char put in buf (null). May return an error. - */ -int kgdb_mem2hex(char *mem, char *buf, int count) -{ - char *tmp; - int err; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory copy. Hex conversion will work against this one. - */ - tmp = buf + count; - - err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; - } - - return err; -} - -/* - * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return a pointer to the character after - * the last byte written. - */ -static int kgdb_ebin2mem(char *buf, char *mem, int count) -{ - int err = 0; - char c; - - while (count-- > 0) { - c = *buf++; - if (c == 0x7d) - c = *buf++ ^ 0x20; - - err = probe_kernel_write(mem, &c, 1); - if (err) - break; - - mem++; - } - - return err; -} - -/* - * Convert the hex array pointed to by buf into binary to be placed in mem. 
- * Return a pointer to the character AFTER the last byte written. - * May return an error. - */ -int kgdb_hex2mem(char *buf, char *mem, int count) -{ - char *tmp_raw; - char *tmp_hex; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory that is converted from hex. - */ - tmp_raw = buf + count * 2; - - tmp_hex = tmp_raw - 1; - while (tmp_hex >= buf) { - tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; - } - - return probe_kernel_write(mem, tmp_raw, count); -} - -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. - */ -int kgdb_hex2long(char **ptr, unsigned long *long_val) -{ - int hex_val; - int num = 0; - int negate = 0; - - *long_val = 0; - - if (**ptr == '-') { - negate = 1; - (*ptr)++; - } - while (**ptr) { - hex_val = hex(**ptr); - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - num++; - (*ptr)++; - } - - if (negate) - *long_val = -*long_val; - - return num; -} - -/* Write memory due to an 'M' or 'X' packet. */ -static int write_mem_msg(int binary) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long addr; - unsigned long length; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && - kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { - if (binary) - err = kgdb_ebin2mem(ptr, (char *)addr, length); - else - err = kgdb_hex2mem(ptr, (char *)addr, length); - if (err) - return err; - if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length); - return 0; - } - - return -EINVAL; -} - -static void error_packet(char *pkt, int error) -{ - error = -error; - pkt[0] = 'E'; - pkt[1] = hex_asc[(error / 10)]; - pkt[2] = hex_asc[(error % 10)]; - pkt[3] = '\0'; -} - -/* - * Thread ID accessors. We represent a flat TID space to GDB, where - * the per CPU idle threads (which under Linux all have PID 0) are - * remapped to negative TIDs. 
- */ - -#define BUF_THREAD_ID_SIZE 16 - -static char *pack_threadid(char *pkt, unsigned char *id) -{ - char *limit; - - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); - - return pkt; -} - -static void int_to_threadref(unsigned char *id, int value) -{ - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); -} - -static struct task_struct *getthread(struct pt_regs *regs, int tid) -{ - /* - * Non-positive TIDs are remapped to the cpu shadow information - */ - if (tid == 0 || tid == -1) - tid = -atomic_read(&kgdb_active) - 2; - if (tid < 0) { - if (kgdb_info[-tid - 2].task) - return kgdb_info[-tid - 2].task; - else - return idle_task(-tid - 2); - } - - /* - * find_task_by_pid_ns() does not take the tasklist lock anymore - * but is nicely RCU locked - hence is a pretty resilient - * thing to use: - */ - return find_task_by_pid_ns(tid, &init_pid_ns); -} - -/* - * CPU debug state control: - */ - -#ifdef CONFIG_SMP -static void kgdb_wait(struct pt_regs *regs) -{ - unsigned long flags; - int cpu; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - kgdb_info[cpu].debuggerinfo = regs; - kgdb_info[cpu].task = current; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - smp_wmb(); - atomic_set(&cpu_in_kgdb[cpu], 1); - - /* Wait till primary CPU is done with debugging */ - while (atomic_read(&passive_cpu_wait[cpu])) - cpu_relax(); - - kgdb_info[cpu].debuggerinfo = NULL; - kgdb_info[cpu].task = NULL; - - /* fix up hardware debug registers on local cpu */ - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - - /* Signal the primary CPU that we are done: */ - atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); - clocksource_touch_watchdog(); - local_irq_restore(flags); -} -#endif - -/* - * Some architectures need cache flushes when we set/clear a - * 
breakpoint: - */ -static void kgdb_flush_swbreak_addr(unsigned long addr) -{ - if (!CACHE_FLUSH_IS_SAFE) - return; - - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); - } - /* Force flush instruction cache if it was outside the mm */ - flush_icache_range(addr, addr + BREAK_INSTR_SIZE); -} - -/* - * SW breakpoint management: - */ -static int kgdb_activate_sw_breakpoints(void) -{ - unsigned long addr; - int error = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_SET) - continue; - - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - return error; - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_ACTIVE; - } - return 0; -} - -static int kgdb_set_sw_break(unsigned long addr) -{ - int err = kgdb_validate_break_address(addr); - int breakno = -1; - int i; - - if (err) - return err; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) - return -EEXIST; - } - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_REMOVED && - kgdb_break[i].bpt_addr == addr) { - breakno = i; - break; - } - } - - if (breakno == -1) { - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_UNDEFINED) { - breakno = i; - break; - } - } - } - - if (breakno == -1) - return -E2BIG; - - kgdb_break[breakno].state = BP_SET; - kgdb_break[breakno].type = BP_BREAKPOINT; - kgdb_break[breakno].bpt_addr = addr; - - return 0; -} - -static int kgdb_deactivate_sw_breakpoints(void) -{ - unsigned long addr; - int error = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - return error; - - kgdb_flush_swbreak_addr(addr); 
- kgdb_break[i].state = BP_SET; - } - return 0; -} - -static int kgdb_remove_sw_break(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) { - kgdb_break[i].state = BP_REMOVED; - return 0; - } - } - return -ENOENT; -} - -int kgdb_isremovedbreak(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_REMOVED) && - (kgdb_break[i].bpt_addr == addr)) - return 1; - } - return 0; -} - -static int remove_all_break(void) -{ - unsigned long addr; - int error; - int i; - - /* Clear memory breakpoints. */ - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); -setundefined: - kgdb_break[i].state = BP_UNDEFINED; - } - - /* Clear hardware breakpoints. */ - if (arch_kgdb_ops.remove_all_hw_break) - arch_kgdb_ops.remove_all_hw_break(); - - return 0; -} - -/* - * Remap normal tasks to their real PID, - * CPU shadow threads are mapped to -CPU - 2 - */ -static inline int shadow_pid(int realpid) -{ - if (realpid) - return realpid; - - return -raw_smp_processor_id() - 2; -} - -static char gdbmsgbuf[BUFMAX + 1]; - -static void kgdb_msg_write(const char *s, int len) -{ - char *bufptr; - int wcount; - int i; - - /* 'O'utput */ - gdbmsgbuf[0] = 'O'; - - /* Fill and send buffers... 
*/ - while (len > 0) { - bufptr = gdbmsgbuf + 1; - - /* Calculate how many this time */ - if ((len << 1) > (BUFMAX - 2)) - wcount = (BUFMAX - 2) >> 1; - else - wcount = len; - - /* Pack in hex chars */ - for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); - *bufptr = '\0'; - - /* Move up */ - s += wcount; - len -= wcount; - - /* Write packet */ - put_packet(gdbmsgbuf); - } -} - -/* - * Return true if there is a valid kgdb I/O module. Also if no - * debugger is attached a message can be printed to the console about - * waiting for the debugger to attach. - * - * The print_wait argument is only to be true when called from inside - * the core kgdb_handle_exception, because it will wait for the - * debugger to attach. - */ -static int kgdb_io_ready(int print_wait) -{ - if (!kgdb_io_ops) - return 0; - if (kgdb_connected) - return 1; - if (atomic_read(&kgdb_setting_breakpoint)) - return 1; - if (print_wait) - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); - return 1; -} - -/* - * All the functions that start with gdb_cmd are the various - * operations to implement the handlers for the gdbserial protocol - * where KGDB is communicating with an external debugger - */ - -/* Handle the '?' status packets */ -static void gdb_cmd_status(struct kgdb_state *ks) -{ - /* - * We know that this packet is only sent - * during initial connect. So to be safe, - * we clear out our breakpoints now in case - * GDB is reconnecting. 
- */ - remove_all_break(); - - remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); -} - -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) -{ - struct task_struct *thread; - void *local_debuggerinfo; - int i; - - thread = kgdb_usethread; - if (!thread) { - thread = kgdb_info[ks->cpu].task; - local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; - } else { - local_debuggerinfo = NULL; - for_each_online_cpu(i) { - /* - * Try to find the task on some other - * or possibly this node if we do not - * find the matching task then we try - * to approximate the results. - */ - if (thread == kgdb_info[i].task) - local_debuggerinfo = kgdb_info[i].debuggerinfo; - } - } - - /* - * All threads that don't have debuggerinfo should be - * in __schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. - */ - if (local_debuggerinfo) { - pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); - } else { - /* - * Pull stuff saved during switch_to; nothing - * else is accessible (or even particularly - * relevant). - * - * This should be enough for a stack trace. 
- */ - sleeping_thread_to_gdb_regs(gdb_regs, thread); - } - kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); -} - -/* Handle the 'G' set registers request */ -static void gdb_cmd_setregs(struct kgdb_state *ks) -{ - kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); - - if (kgdb_usethread && kgdb_usethread != current) { - error_packet(remcom_out_buffer, -EINVAL); - } else { - gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); - } -} - -/* Handle the 'm' memory read bytes */ -static void gdb_cmd_memread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long length; - unsigned long addr; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && - kgdb_hex2long(&ptr, &length) > 0) { - err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); - } else { - error_packet(remcom_out_buffer, -EINVAL); - } -} - -/* Handle the 'M' memory write bytes */ -static void gdb_cmd_memwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(0); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'X' memory binary write bytes */ -static void gdb_cmd_binwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(1); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'D' or 'k', detach or kill packets */ -static void gdb_cmd_detachkill(struct kgdb_state *ks) -{ - int error; - - /* The detach case */ - if (remcom_in_buffer[0] == 'D') { - error = remove_all_break(); - if (error < 0) { - error_packet(remcom_out_buffer, error); - } else { - strcpy(remcom_out_buffer, "OK"); - kgdb_connected = 0; - } - put_packet(remcom_out_buffer); - } else { - /* - * Assume the kill case, with no exit code checking, - * trying to force detach the debugger: - */ - remove_all_break(); - kgdb_connected = 0; - } -} - -/* Handle the 'R' 
reboot packets */ -static int gdb_cmd_reboot(struct kgdb_state *ks) -{ - /* For now, only honor R0 */ - if (strcmp(remcom_in_buffer, "R0") == 0) { - printk(KERN_CRIT "Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); - put_packet(remcom_out_buffer); - - /* - * Execution should not return from - * machine_emergency_restart() - */ - machine_emergency_restart(); - kgdb_connected = 0; - - return 1; - } - return 0; -} - -/* Handle the 'q' query packets */ -static void gdb_cmd_query(struct kgdb_state *ks) -{ - struct task_struct *g; - struct task_struct *p; - unsigned char thref[8]; - char *ptr; - int i; - int cpu; - int finished = 0; - - switch (remcom_in_buffer[1]) { - case 's': - case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) - break; - - i = 0; - remcom_out_buffer[0] = 'm'; - ptr = remcom_out_buffer + 1; - if (remcom_in_buffer[1] == 'f') { - /* Each cpu is a shadow thread */ - for_each_online_cpu(cpu) { - ks->thr_query = 0; - int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - i++; - } - } - - do_each_thread(g, p) { - if (i >= ks->thr_query && !finished) { - int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - ks->thr_query++; - if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) - finished = 1; - } - i++; - } while_each_thread(g, p); - - *(--ptr) = '\0'; - break; - - case 'C': - /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); - ks->threadid = shadow_pid(current->pid); - int_to_threadref(thref, ks->threadid); - pack_threadid(remcom_out_buffer + 2, thref); - break; - case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) - break; - - ks->threadid = 0; - ptr = remcom_in_buffer + 17; - kgdb_hex2long(&ptr, &ks->threadid); - if (!getthread(ks->linux_regs, ks->threadid)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - if ((int)ks->threadid > 0) { - kgdb_mem2hex(getthread(ks->linux_regs, - 
ks->threadid)->comm, - remcom_out_buffer, 16); - } else { - static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - - sprintf(tmpstr, "shadowCPU%d", - (int)(-ks->threadid - 2)); - kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); - } - break; - } -} - -/* Handle the 'H' task query packets */ -static void gdb_cmd_task(struct kgdb_state *ks) -{ - struct task_struct *thread; - char *ptr; - - switch (remcom_in_buffer[1]) { - case 'g': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_usethread = thread; - ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); - break; - case 'c': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - if (!ks->threadid) { - kgdb_contthread = NULL; - } else { - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_contthread = thread; - } - strcpy(remcom_out_buffer, "OK"); - break; - } -} - -/* Handle the 'T' thread query packets */ -static void gdb_cmd_thread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - struct task_struct *thread; - - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, -EINVAL); -} - -/* Handle the 'z' or 'Z' breakpoint remove or set packets */ -static void gdb_cmd_break(struct kgdb_state *ks) -{ - /* - * Since GDB-5.3, it's been drafted that '0' is a software - * breakpoint, '1' is a hardware breakpoint, so let's do that. 
- */ - char *bpt_type = &remcom_in_buffer[1]; - char *ptr = &remcom_in_buffer[2]; - unsigned long addr; - unsigned long length; - int error = 0; - - if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { - /* Unsupported */ - if (*bpt_type > '4') - return; - } else { - if (*bpt_type != '0' && *bpt_type != '1') - /* Unsupported. */ - return; - } - - /* - * Test if this is a hardware breakpoint, and - * if we support it: - */ - if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) - /* Unsupported. */ - return; - - if (*(ptr++) != ',') { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (!kgdb_hex2long(&ptr, &addr)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (*(ptr++) != ',' || - !kgdb_hex2long(&ptr, &length)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - - if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') - error = kgdb_set_sw_break(addr); - else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') - error = kgdb_remove_sw_break(addr); - else if (remcom_in_buffer[0] == 'Z') - error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type - '0'); - else if (remcom_in_buffer[0] == 'z') - error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type - '0'); - - if (error == 0) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, error); -} - -/* Handle the 'C' signal / exception passing packets */ -static int gdb_cmd_exception_pass(struct kgdb_state *ks) -{ - /* C09 == pass exception - * C15 == detach kgdb, pass exception - */ - if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'c'; - - } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'D'; - remove_all_break(); - kgdb_connected = 0; - return 1; - - } else { - error_packet(remcom_out_buffer, -EINVAL); - return 0; - } - - /* Indicate fall through */ - return -1; -} - -/* 
- * This function performs all gdbserial command procesing - */ -static int gdb_serial_stub(struct kgdb_state *ks) -{ - int error = 0; - int tmp; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - if (kgdb_connected) { - unsigned char thref[8]; - char *ptr; - - /* Reply to host that an exception has occurred */ - ptr = remcom_out_buffer; - *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); - ptr += strlen(strcpy(ptr, "thread:")); - int_to_threadref(thref, shadow_pid(current->pid)); - ptr = pack_threadid(ptr, thref); - *ptr++ = ';'; - put_packet(remcom_out_buffer); - } - - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - - while (1) { - error = 0; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - get_packet(remcom_in_buffer); - - switch (remcom_in_buffer[0]) { - case '?': /* gdbserial status */ - gdb_cmd_status(ks); - break; - case 'g': /* return the value of the CPU registers */ - gdb_cmd_getregs(ks); - break; - case 'G': /* set the value of the CPU registers - return OK */ - gdb_cmd_setregs(ks); - break; - case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ - gdb_cmd_memread(ks); - break; - case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_memwrite(ks); - break; - case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_binwrite(ks); - break; - /* kill or detach. KGDB should treat this like a - * continue. 
- */ - case 'D': /* Debugger detach */ - case 'k': /* Debugger detach via kill */ - gdb_cmd_detachkill(ks); - goto default_handle; - case 'R': /* Reboot */ - if (gdb_cmd_reboot(ks)) - goto default_handle; - break; - case 'q': /* query command */ - gdb_cmd_query(ks); - break; - case 'H': /* task related */ - gdb_cmd_task(ks); - break; - case 'T': /* Query thread status */ - gdb_cmd_thread(ks); - break; - case 'z': /* Break point remove */ - case 'Z': /* Break point set */ - gdb_cmd_break(ks); - break; - case 'C': /* Exception passing */ - tmp = gdb_cmd_exception_pass(ks); - if (tmp > 0) - goto default_handle; - if (tmp == 0) - break; - /* Fall through on tmp < 0 */ - case 'c': /* Continue packet */ - case 's': /* Single step packet */ - if (kgdb_contthread && kgdb_contthread != current) { - /* Can't switch threads in kgdb */ - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_activate_sw_breakpoints(); - /* Fall through to default processing */ - default: -default_handle: - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - /* - * Leave cmd processing on error, detach, - * kill, continue, or single step. 
- */ - if (error >= 0 || remcom_in_buffer[0] == 'D' || - remcom_in_buffer[0] == 'k') { - error = 0; - goto kgdb_exit; - } - - } - - /* reply to the request */ - put_packet(remcom_out_buffer); - } - -kgdb_exit: - if (ks->pass_exception) - error = 1; - return error; -} - -static int kgdb_reenter_check(struct kgdb_state *ks) -{ - unsigned long addr; - - if (atomic_read(&kgdb_active) != raw_smp_processor_id()) - return 0; - - /* Panic on recursive debugger calls: */ - exception_level++; - addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - kgdb_deactivate_sw_breakpoints(); - - /* - * If the break point removed ok at the place exception - * occurred, try to recover and print a warning to the end - * user because the user planted a breakpoint in a place that - * KGDB needs in order to function. - */ - if (kgdb_remove_sw_break(addr) == 0) { - exception_level = 0; - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - kgdb_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); - WARN_ON_ONCE(1); - - return 1; - } - remove_all_break(); - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - - if (exception_level > 1) { - dump_stack(); - panic("Recursive entry to debugger"); - } - - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); - dump_stack(); - panic("Recursive entry to debugger"); - - return 1; -} - -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - * interface locks, if any (begin_session) - * kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) -{ - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - unsigned long flags; - int error = 0; - int i, cpu; - - ks->cpu = raw_smp_processor_id(); - ks->ex_vector = evector; - ks->signo = signo; - ks->ex_vector = evector; - ks->err_code = ecode; - ks->kgdb_usethreadid = 0; - ks->linux_regs = regs; - - if (kgdb_reenter_check(ks)) 
- return 0; /* Ouch, double exception ! */ - -acquirelock: - /* - * Interrupts will be restored by the 'trap return' code, except when - * single stepping. - */ - local_irq_save(flags); - - cpu = raw_smp_processor_id(); - - /* - * Acquire the kgdb_active lock: - */ - while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) - cpu_relax(); - - /* - * Do not start the debugger connection on this CPU if the last - * instance of the exception handler wanted to come into the - * debugger on a different CPU via a single step - */ - if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - atomic_read(&kgdb_cpu_doing_single_step) != cpu) { - - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - goto acquirelock; - } - - if (!kgdb_io_ready(1)) { - error = 1; - goto kgdb_restore; /* No I/O connection, so resume the system */ - } - - /* - * Don't enter if we have hit a removed breakpoint. - */ - if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) - goto kgdb_restore; - - /* Call the I/O driver's pre_exception routine */ - if (kgdb_io_ops->pre_exception) - kgdb_io_ops->pre_exception(); - - kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; - kgdb_info[ks->cpu].task = current; - - kgdb_disable_hw_debug(ks->linux_regs); - - /* - * Get the passive CPU lock which will hold all the non-primary - * CPU in a spin state while the debugger is active - */ - if (!kgdb_single_step) { - for (i = 0; i < NR_CPUS; i++) - atomic_set(&passive_cpu_wait[i], 1); - } - - /* - * spin_lock code is good enough as a barrier so we don't - * need one here: - */ - atomic_set(&cpu_in_kgdb[ks->cpu], 1); - -#ifdef CONFIG_SMP - /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); -#endif - - /* - * Wait for the other CPUs to be notified and be waiting for us: - */ - for_each_online_cpu(i) { - while (!atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - - /* - * At this point 
the primary processor is completely - * in the debugger and all secondary CPUs are quiescent - */ - kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); - kgdb_deactivate_sw_breakpoints(); - kgdb_single_step = 0; - kgdb_contthread = current; - exception_level = 0; - - /* Talk to debugger with gdbserial protocol */ - error = gdb_serial_stub(ks); - - /* Call the I/O driver's post_exception routine */ - if (kgdb_io_ops->post_exception) - kgdb_io_ops->post_exception(); - - kgdb_info[ks->cpu].debuggerinfo = NULL; - kgdb_info[ks->cpu].task = NULL; - atomic_set(&cpu_in_kgdb[ks->cpu], 0); - - if (!kgdb_single_step) { - for (i = NR_CPUS-1; i >= 0; i--) - atomic_set(&passive_cpu_wait[i], 0); - /* - * Wait till all the CPUs have quit - * from the debugger. - */ - for_each_online_cpu(i) { - while (atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - } - -kgdb_restore: - /* Free kgdb_active */ - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - return error; -} - -int kgdb_nmicallback(int cpu, void *regs) -{ -#ifdef CONFIG_SMP - if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != cpu && - atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { - kgdb_wait((struct pt_regs *)regs); - return 0; - } -#endif - return 1; -} - -static void kgdb_console_write(struct console *co, const char *s, - unsigned count) -{ - unsigned long flags; - - /* If we're debugging, or KGDB has not connected, don't try - * and print. 
*/ - if (!kgdb_connected || atomic_read(&kgdb_active) != -1) - return; - - local_irq_save(flags); - kgdb_msg_write(s, count); - local_irq_restore(flags); -} - -static struct console kgdbcons = { - .name = "kgdb", - .write = kgdb_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED, - .index = -1, -}; - -#ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_gdb(int key, struct tty_struct *tty) -{ - if (!kgdb_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); - return; - } - if (!kgdb_connected) - printk(KERN_CRIT "Entering KGDB\n"); - - kgdb_breakpoint(); -} - -static struct sysrq_key_op sysrq_gdb_op = { - .handler = sysrq_handle_gdb, - .help_msg = "Gdb", - .action_msg = "GDB", -}; -#endif - -static void kgdb_register_callbacks(void) -{ - if (!kgdb_io_module_registered) { - kgdb_io_module_registered = 1; - kgdb_arch_init(); -#ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_use_con && !kgdb_con_registered) { - register_console(&kgdbcons); - kgdb_con_registered = 1; - } - } -} - -static void kgdb_unregister_callbacks(void) -{ - /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any - * break exceptions at the time. - */ - if (kgdb_io_module_registered) { - kgdb_io_module_registered = 0; - kgdb_arch_exit(); -#ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_con_registered) { - unregister_console(&kgdbcons); - kgdb_con_registered = 0; - } - } -} - -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - -/** - * kgdb_register_io_module - register KGDB IO module - * @new_kgdb_io_ops: the io ops vector - * - * Register it with the KGDB core. 
- */ -int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) -{ - int err; - - spin_lock(&kgdb_registration_lock); - - if (kgdb_io_ops) { - spin_unlock(&kgdb_registration_lock); - - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); - return -EBUSY; - } - - if (new_kgdb_io_ops->init) { - err = new_kgdb_io_ops->init(); - if (err) { - spin_unlock(&kgdb_registration_lock); - return err; - } - } - - kgdb_io_ops = new_kgdb_io_ops; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_kgdb_io_ops->name); - - /* Arm KGDB now. */ - kgdb_register_callbacks(); - - if (kgdb_break_asap) - kgdb_initial_breakpoint(); - - return 0; -} -EXPORT_SYMBOL_GPL(kgdb_register_io_module); - -/** - * kkgdb_unregister_io_module - unregister KGDB IO module - * @old_kgdb_io_ops: the io ops vector - * - * Unregister it with the KGDB core. - */ -void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) -{ - BUG_ON(kgdb_connected); - - /* - * KGDB is no longer able to communicate out, so - * unregister our callbacks and reset state. - */ - kgdb_unregister_callbacks(); - - spin_lock(&kgdb_registration_lock); - - WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); - kgdb_io_ops = NULL; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", - old_kgdb_io_ops->name); -} -EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); - -/** - * kgdb_breakpoint - generate breakpoint exception - * - * This function will generate a breakpoint exception. It is used at the - * beginning of a program to sync up with a debugger and can be used - * otherwise as a quick means to stop program execution and "break" into - * the debugger. 
- */ -void kgdb_breakpoint(void) -{ - atomic_set(&kgdb_setting_breakpoint, 1); - wmb(); /* Sync point before breakpoint */ - arch_kgdb_breakpoint(); - wmb(); /* Sync point after breakpoint */ - atomic_set(&kgdb_setting_breakpoint, 0); -} -EXPORT_SYMBOL_GPL(kgdb_breakpoint); - -static int __init opt_kgdb_wait(char *str) -{ - kgdb_break_asap = 1; - - if (kgdb_io_module_registered) - kgdb_initial_breakpoint(); - - return 0; -} - -early_param("kgdbwait", opt_kgdb_wait); -/* - kmod, the new module loader (replaces kerneld) - Kirk Petersen - - Reorganized not to be a daemon by Adam Richter, with guidance - from Greg Zornetzer. - - Modified to avoid chroot and file sharing problems. - Mikael Pettersson - - Limit the concurrent number of kmod modprobes to catch loops from - "modprobe needs a service that is in a module". - Keith Owens December 1999 - - Unblock all signals when we exec a usermode process. - Shuu Yamaguchi December 2000 - - call_usermodehelper wait flag, and remove exec_usermodehelper. - Rusty Russell Jan 2003 -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern int max_threads; - -static struct workqueue_struct *khelper_wq; - -#ifdef CONFIG_MODULES - -/* - modprobe_path is set via /proc/sys. -*/ -char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; - -/** - * request_module - try to load a kernel module - * @fmt: printf style format string for the name of the module - * @varargs: arguements as specified in the format string - * - * Load a module using the user mode module loader. The function returns - * zero on success or a negative errno code on failure. Note that a - * successful module load does not mean the module did not then unload - * and exit on an error of its own. Callers must check that the service - * they requested is now available not blindly invoke it. 
- * - * If module auto-loading support is disabled then this function - * becomes a no-operation. - */ -int request_module(const char *fmt, ...) -{ - va_list args; - char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; - int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ - static int kmod_loop_msg; - - va_start(args, fmt); - ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); - va_end(args); - if (ret >= MODULE_NAME_LEN) - return -ENAMETOOLONG; - - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method - * would be to run the parents of this process, counting how many times - * kmod was invoked. That would mean accessing the internals of the - * process tables to get the command line, proc_pid_cmdline is static - * and it is not worth changing the proc code just to handle this case. - * KAO. - * - * "trace the ppid" is simple, but will fail if someone's - * parent exits. I think this is as good as it gets. 
--RR - */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg++ < 5) - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - atomic_dec(&kmod_concurrent); - return -ENOMEM; - } - - ret = call_usermodehelper(modprobe_path, argv, envp, 1); - atomic_dec(&kmod_concurrent); - return ret; -} -EXPORT_SYMBOL(request_module); -#endif /* CONFIG_KMOD */ - -struct subprocess_info { - struct work_struct work; - struct completion *complete; - char *path; - char **argv; - char **envp; - struct key *ring; - enum umh_wait wait; - int retval; - struct file *stdin; - void (*cleanup)(char **argv, char **envp); -}; - -/* - * This is the task which runs the usermode application - */ -static int ____call_usermodehelper(void *data) -{ - struct subprocess_info *sub_info = data; - struct key *new_session, *old_session; - int retval; - - /* Unblock all signals and set the session keyring. */ - new_session = key_get(sub_info->ring); - spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, new_session); - flush_signal_handlers(current, 1); - sigemptyset(¤t->blocked); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - key_put(old_session); - - /* Install input pipe when needed */ - if (sub_info->stdin) { - struct files_struct *f = current->files; - struct fdtable *fdt; - /* no races because files should be private here */ - sys_close(0); - fd_install(0, sub_info->stdin); - spin_lock(&f->file_lock); - fdt = files_fdtable(f); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); - spin_unlock(&f->file_lock); - - /* and disallow core files too */ - current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; - } - - /* We can run anywhere, unlike our parent keventd(). 
*/ - set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); - - /* - * Our parent is keventd, which runs with elevated scheduling priority. - * Avoid propagating that into the userspace child. - */ - set_user_nice(current, 0); - - retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); - - /* Exec failed? */ - sub_info->retval = retval; - do_exit(0); -} - -void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info->argv, info->envp); - kfree(info); -} -EXPORT_SYMBOL(call_usermodehelper_freeinfo); - -/* Keventd can't block, but this (a child) can. */ -static int wait_for_helper(void *data) -{ - struct subprocess_info *sub_info = data; - pid_t pid; - - /* Install a handler: if SIGCLD isn't handled sys_wait4 won't - * populate the status, but will return -ECHILD. */ - allow_signal(SIGCHLD); - - pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - } else { - int ret; - - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But wait_for_helper() always runs as keventd, and put_user() - * to a kernel address works OK for kernel threads, due to their - * having an mm_segment_t which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either ____call_usermodehelper failed and the - * real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. 
- */ - if (ret) - sub_info->retval = ret; - } - - if (sub_info->wait == UMH_NO_WAIT) - call_usermodehelper_freeinfo(sub_info); - else - complete(sub_info->complete); - return 0; -} - -/* This is run by khelper thread */ -static void __call_usermodehelper(struct work_struct *work) -{ - struct subprocess_info *sub_info = - container_of(work, struct subprocess_info, work); - pid_t pid; - enum umh_wait wait = sub_info->wait; - - /* CLONE_VFORK: wait until the usermode helper has execve'd - * successfully We need the data structures to stay around - * until that is done. */ - if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) - pid = kernel_thread(wait_for_helper, sub_info, - CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, - CLONE_VFORK | SIGCHLD); - - switch (wait) { - case UMH_NO_WAIT: - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - sub_info->retval = pid; - /* FALLTHROUGH */ - - case UMH_WAIT_EXEC: - complete(sub_info->complete); - } -} - -#ifdef CONFIG_PM -/* - * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY - * (used for preventing user land processes from being created after the user - * land has been frozen during a system-wide hibernation or suspend operation). - */ -static int usermodehelper_disabled; - -/* Number of helpers running */ -static atomic_t running_helpers = ATOMIC_INIT(0); - -/* - * Wait queue head used by usermodehelper_pm_callback() to wait for all running - * helpers to finish. 
- */ -static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); - -/* - * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_pm_callback() fails - */ -#define RUNNING_HELPERS_TIMEOUT (5 * HZ) - -static int usermodehelper_pm_callback(struct notifier_block *nfb, - unsigned long action, - void *ignored) -{ - long retval; - - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - usermodehelper_disabled = 1; - smp_mb(); - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, - atomic_read(&running_helpers) == 0, - RUNNING_HELPERS_TIMEOUT); - if (retval) { - return NOTIFY_OK; - } else { - usermodehelper_disabled = 0; - return NOTIFY_BAD; - } - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - usermodehelper_disabled = 0; - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static void helper_lock(void) -{ - atomic_inc(&running_helpers); - smp_mb__after_atomic_inc(); -} - -static void helper_unlock(void) -{ - if (atomic_dec_and_test(&running_helpers)) - wake_up(&running_helpers_waitq); -} - -static void register_pm_notifier_callback(void) -{ - pm_notifier(usermodehelper_pm_callback, 0); -} -#else /* CONFIG_PM */ -#define usermodehelper_disabled 0 - -static inline void helper_lock(void) {} -static inline void helper_unlock(void) {} -static inline void register_pm_notifier_callback(void) {} -#endif /* CONFIG_PM */ - -/** - * call_usermodehelper_setup - prepare to call a usermode helper - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @gfp_mask: gfp mask for memory allocation - * - * Returns either %NULL on allocation failure, or a subprocess_info - * structure. 
This should be passed to call_usermodehelper_exec to - * exec the process and free the structure. - */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask) -{ - struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); - if (!sub_info) - goto out; - - INIT_WORK(&sub_info->work, __call_usermodehelper); - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - - out: - return sub_info; -} -EXPORT_SYMBOL(call_usermodehelper_setup); - -/** - * call_usermodehelper_setkeys - set the session keys for usermode helper - * @info: a subprocess_info returned by call_usermodehelper_setup - * @session_keyring: the session keyring for the process - */ -void call_usermodehelper_setkeys(struct subprocess_info *info, - struct key *session_keyring) -{ - info->ring = session_keyring; -} -EXPORT_SYMBOL(call_usermodehelper_setkeys); - -/** - * call_usermodehelper_setcleanup - set a cleanup function - * @info: a subprocess_info returned by call_usermodehelper_setup - * @cleanup: a cleanup function - * - * The cleanup function is just befor ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -void call_usermodehelper_setcleanup(struct subprocess_info *info, - void (*cleanup)(char **argv, char **envp)) -{ - info->cleanup = cleanup; -} -EXPORT_SYMBOL(call_usermodehelper_setcleanup); - -/** - * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin - * @sub_info: a subprocess_info returned by call_usermodehelper_setup - * @filp: set to the write-end of a pipe - * - * This constructs a pipe, and sets the read end to be the stdin of the - * subprocess, and returns the write-end in *@filp. 
- */ -int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, - struct file **filp) -{ - struct file *f; - - f = create_write_pipe(0); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; - - f = create_read_pipe(f, 0); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info->stdin = f; - - return 0; -} -EXPORT_SYMBOL(call_usermodehelper_stdinpipe); - -/** - * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa - * @wait: wait for the application to finish and return status. - * when -1 don't wait at all, but you get no useful error back when - * the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of keventd. - * (ie. it runs with full root capabilities). - */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) -{ - DECLARE_COMPLETION_ONSTACK(done); - int retval = 0; - - helper_lock(); - if (sub_info->path[0] == '\0') - goto out; - - if (!khelper_wq || usermodehelper_disabled) { - retval = -EBUSY; - goto out; - } - - sub_info->complete = &done; - sub_info->wait = wait; - - queue_work(khelper_wq, &sub_info->work); - if (wait == UMH_NO_WAIT) /* task has freed sub_info */ - goto unlock; - wait_for_completion(&done); - retval = sub_info->retval; - -out: - call_usermodehelper_freeinfo(sub_info); -unlock: - helper_unlock(); - return retval; -} -EXPORT_SYMBOL(call_usermodehelper_exec); - -/** - * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @filp: set to the write-end of a pipe - * - * This is a simple wrapper which executes a usermode-helper function - * with a pipe as stdin. 
It is implemented entirely in terms of - * lower-level call_usermodehelper_* functions. - */ -int call_usermodehelper_pipe(char *path, char **argv, char **envp, - struct file **filp) -{ - struct subprocess_info *sub_info; - int ret; - - sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL); - if (sub_info == NULL) - return -ENOMEM; - - ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) - goto out; - - return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - - out: - call_usermodehelper_freeinfo(sub_info); - return ret; -} -EXPORT_SYMBOL(call_usermodehelper_pipe); - -void __init usermodehelper_init(void) -{ - khelper_wq = create_singlethread_workqueue("khelper"); - BUG_ON(!khelper_wq); - register_pm_notifier_callback(); -} -/* - * Kernel Probes (KProbes) - * kernel/kprobes.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S Kernel - * Probes initial implementation (includes suggestions from - * Rusty Russell). - * 2004-Aug Updated by Prasanna S Panchamukhi with - * hlists and exceptions notifier as suggested by Andi Kleen. - * 2004-July Suparna Bhattacharya added jumper probes - * interface to access function arguments. 
- * 2004-Sep Prasanna S Panchamukhi Changed Kprobes - * exceptions notifier to be first on the priority list. - * 2005-May Hien Nguyen , Jim Keniston - * and Prasanna S Panchamukhi - * added function-return probes. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define KPROBE_HASH_BITS 6 -#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) - - -/* - * Some oddball architectures like 64bit powerpc have function descriptors - * so this must be overridable. - */ -#ifndef kprobe_lookup_name -#define kprobe_lookup_name(name, addr) \ - addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) -#endif - -static int kprobes_initialized; -static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; - -/* NOTE: change this value only with kprobe_mutex held */ -static bool kprobe_enabled; - -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; -static struct { - spinlock_t lock ____cacheline_aligned; -} kretprobe_table_locks[KPROBE_TABLE_SIZE]; - -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -{ - return &(kretprobe_table_locks[hash].lock); -} - -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {NULL} /* Terminator */ -}; - -#ifdef __ARCH_WANT_KPROBES_INSN_SLOT -/* - * kprobe->ainsn.insn points to the copy of the instruction to be - * single-stepped. 
x86_64, POWER4 and above have no-exec support and - * stepping on the instruction on a vmalloced/kmalloced/data page - * is a recipe for disaster - */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) - -struct kprobe_insn_page { - struct hlist_node hlist; - kprobe_opcode_t *insns; /* Page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; - int nused; - int ngarbage; -}; - -enum kprobe_slot_state { - SLOT_CLEAN = 0, - SLOT_DIRTY = 1, - SLOT_USED = 2, -}; - -static struct hlist_head kprobe_insn_pages; -static int kprobe_garbage_slots; -static int collect_garbage_slots(void); - -static int __kprobes check_safety(void) -{ - int ret = 0; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) - ret = freeze_processes(); - if (ret == 0) { - struct task_struct *p, *q; - do_each_thread(p, q) { - if (p != current && p->state == TASK_RUNNING && - p->pid != 0) { - printk("Check failed: %s is running\n",p->comm); - ret = -1; - goto loop_end; - } - } while_each_thread(p, q); - } -loop_end: - thaw_processes(); -#else - synchronize_sched(); -#endif - return ret; -} - -/** - * get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. - */ -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - retry: - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { - if (kip->nused < INSNS_PER_PAGE) { - int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { - if (kip->slot_used[i] == SLOT_CLEAN) { - kip->slot_used[i] = SLOT_USED; - kip->nused++; - return kip->insns + (i * MAX_INSN_SIZE); - } - } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; - } - } - - /* If there are any garbage slots, collect it and try again. */ - if (kprobe_garbage_slots && collect_garbage_slots() == 0) { - goto retry; - } - /* All out of space. Need to allocate a new page. Use slot 0. 
*/ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) - return NULL; - - /* - * Use module_alloc so this page is within +/- 2GB of where the - * kernel image and loaded module images reside. This is required - * so x86_64 can correctly handle the %rip-relative fixups. - */ - kip->insns = module_alloc(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; - } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); - memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); - kip->slot_used[0] = SLOT_USED; - kip->nused = 1; - kip->ngarbage = 0; - return kip->insns; -} - -/* Return 1 if all garbages are collected, otherwise 0. */ -static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) -{ - kip->slot_used[idx] = SLOT_CLEAN; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. 
- */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } - return 1; - } - return 0; -} - -static int __kprobes collect_garbage_slots(void) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos, *next; - - /* Ensure no-one is preepmted on the garbages */ - if (check_safety() != 0) - return -EAGAIN; - - hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { - int i; - if (kip->ngarbage == 0) - continue; - kip->ngarbage = 0; /* we will collect all garbages */ - for (i = 0; i < INSNS_PER_PAGE; i++) { - if (kip->slot_used[i] == SLOT_DIRTY && - collect_one_slot(kip, i)) - break; - } - } - kprobe_garbage_slots = 0; - return 0; -} - -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { - if (kip->insns <= slot && - slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; - if (dirty) { - kip->slot_used[i] = SLOT_DIRTY; - kip->ngarbage++; - } else { - collect_one_slot(kip, i); - } - break; - } - } - - if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) - collect_garbage_slots(); -} -#endif - -/* We have preemption disabled.. 
so it is safe to use __ versions */ -static inline void set_kprobe_instance(struct kprobe *kp) -{ - __get_cpu_var(kprobe_instance) = kp; -} - -static inline void reset_kprobe_instance(void) -{ - __get_cpu_var(kprobe_instance) = NULL; -} - -/* - * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c - */ -struct kprobe __kprobes *get_kprobe(void *addr) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - - head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, node, head, hlist) { - if (p->addr == addr) - return p; - } - return NULL; -} - -/* - * Aggregate handlers for multiple kprobes support - these handlers - * take care of invoking the individual kprobe handlers on p->list - */ -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler) { - set_kprobe_instance(kp); - if (kp->pre_handler(kp, regs)) - return 1; - } - reset_kprobe_instance(); - } - return 0; -} - -static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler) { - set_kprobe_instance(kp); - kp->post_handler(kp, regs, flags); - reset_kprobe_instance(); - } - } -} - -static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) -{ - struct kprobe *cur = __get_cpu_var(kprobe_instance); - - /* - * if we faulted "during" the execution of a user specified - * probe handler, invoke just that probe's fault handler - */ - if (cur && cur->fault_handler) { - if (cur->fault_handler(cur, regs, trapnr)) - return 1; - } - return 0; -} - -static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *cur = 
__get_cpu_var(kprobe_instance); - int ret = 0; - - if (cur && cur->break_handler) { - if (cur->break_handler(cur, regs)) - ret = 1; - } - reset_kprobe_instance(); - return ret; -} - -/* Walks the list and increments nmissed count for multiprobe case */ -void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) -{ - struct kprobe *kp; - if (p->pre_handler != aggr_pre_handler) { - p->nmissed++; - } else { - list_for_each_entry_rcu(kp, &p->list, list) - kp->nmissed++; - } - return; -} - -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) -{ - struct kretprobe *rp = ri->rp; - - /* remove rp inst off the rprobe_inst_table */ - hlist_del(&ri->hlist); - INIT_HLIST_NODE(&ri->hlist); - if (likely(rp)) { - spin_lock(&rp->lock); - hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); - } else - /* Unregistering */ - hlist_add_head(&ri->hlist, head); -} - -void kretprobe_hash_lock(struct task_struct *tsk, - struct hlist_head **head, unsigned long *flags) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; - - *head = &kretprobe_inst_table[hash]; - hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); -} - -void kretprobe_table_lock(unsigned long hash, unsigned long *flags) -{ - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); -} - -void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; - - hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); -} - -void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) -{ - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); -} - -/* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe 
instances associated - * with this task. These left over instances represent probed functions - * that have been called but will never return. - */ -void __kprobes kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long hash, flags = 0; - - if (unlikely(!kprobes_initialized)) - /* Early boot. kretprobe_table_locks not yet initialized. */ - return; - - hash = hash_ptr(tk, KPROBE_HASH_BITS); - head = &kretprobe_inst_table[hash]; - kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri, &empty_rp); - } - kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } -} - -static inline void free_rp_inst(struct kretprobe *rp) -{ - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; - - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } -} - -static void __kprobes cleanup_rp_inst(struct kretprobe *rp) -{ - unsigned long flags, hash; - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; - struct hlist_head *head; - - /* No race here */ - for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { - kretprobe_table_lock(hash, &flags); - head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, pos, next, head, hlist) { - if (ri->rp == rp) - ri->rp = NULL; - } - kretprobe_table_unlock(hash, &flags); - } - free_rp_inst(rp); -} - -/* - * Keep all fields in the kprobe consistent - */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) -{ - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); -} - -/* -* Add the new probe to old_p->list. 
Fail if this is the -* second jprobe at the address - two jprobes can't coexist -*/ -static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) -{ - if (p->break_handler) { - if (old_p->break_handler) - return -EEXIST; - list_add_tail_rcu(&p->list, &old_p->list); - old_p->break_handler = aggr_break_handler; - } else - list_add_rcu(&p->list, &old_p->list); - if (p->post_handler && !old_p->post_handler) - old_p->post_handler = aggr_post_handler; - return 0; -} - -/* - * Fill in the required fields of the "manager kprobe". Replace the - * earlier kprobe in the hlist with the manager kprobe - */ -static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) -{ - copy_kprobe(p, ap); - flush_insn_slot(ap); - ap->addr = p->addr; - ap->pre_handler = aggr_pre_handler; - ap->fault_handler = aggr_fault_handler; - if (p->post_handler) - ap->post_handler = aggr_post_handler; - if (p->break_handler) - ap->break_handler = aggr_break_handler; - - INIT_LIST_HEAD(&ap->list); - list_add_rcu(&p->list, &ap->list); - - hlist_replace_rcu(&p->hlist, &ap->hlist); -} - -/* - * This is the second or subsequent kprobe at the address - handle - * the intricacies - */ -static int __kprobes register_aggr_kprobe(struct kprobe *old_p, - struct kprobe *p) -{ - int ret = 0; - struct kprobe *ap; - - if (old_p->pre_handler == aggr_pre_handler) { - copy_kprobe(old_p, p); - ret = add_new_kprobe(old_p, p); - } else { - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) - return -ENOMEM; - add_aggr_kprobe(ap, old_p); - copy_kprobe(ap, p); - ret = add_new_kprobe(ap, p); - } - return ret; -} - -static int __kprobes in_kprobes_functions(unsigned long addr) -{ - struct kprobe_blackpoint *kb; - - if (addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) - return -EINVAL; - /* - * If there exists a kprobe_blacklist, verify and - * fail any probe registration in the prohibited area - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) 
{ - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } - } - return 0; -} - -/* - * If we have a symbol_name argument, look it up and add the offset field - * to it. This way, we can specify a relative address to a symbol. - */ -static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) -{ - kprobe_opcode_t *addr = p->addr; - if (p->symbol_name) { - if (addr) - return NULL; - kprobe_lookup_name(p->symbol_name, addr); - } - - if (!addr) - return NULL; - return (kprobe_opcode_t *)(((char *)addr) + p->offset); -} - -static int __kprobes __register_kprobe(struct kprobe *p, - unsigned long called_from) -{ - int ret = 0; - struct kprobe *old_p; - struct module *probed_mod; - kprobe_opcode_t *addr; - - addr = kprobe_addr(p); - if (!addr) - return -EINVAL; - p->addr = addr; - - if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) - return -EINVAL; - - p->mod_refcounted = 0; - - /* - * Check if are we probing a module. - */ - probed_mod = module_text_address((unsigned long) p->addr); - if (probed_mod) { - struct module *calling_mod = module_text_address(called_from); - /* - * We must allow modules to probe themself and in this case - * avoid incrementing the module refcount, so as to allow - * unloading of self probing modules. 
- */ - if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) - return -EINVAL; - p->mod_refcounted = 1; - } else - probed_mod = NULL; - } - - p->nmissed = 0; - INIT_LIST_HEAD(&p->list); - mutex_lock(&kprobe_mutex); - old_p = get_kprobe(p->addr); - if (old_p) { - ret = register_aggr_kprobe(old_p, p); - goto out; - } - - ret = arch_prepare_kprobe(p); - if (ret) - goto out; - - INIT_HLIST_NODE(&p->hlist); - hlist_add_head_rcu(&p->hlist, - &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - - if (kprobe_enabled) - arch_arm_kprobe(p); - -out: - mutex_unlock(&kprobe_mutex); - - if (ret && probed_mod) - module_put(probed_mod); - return ret; -} - -/* - * Unregister a kprobe without a scheduler synchronization. - */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) -{ - struct kprobe *old_p, *list_p; - - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) - return -EINVAL; - - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid_p; - return -EINVAL; - } -valid_p: - if (old_p == p || - (old_p->pre_handler == aggr_pre_handler && - list_is_singular(&old_p->list))) { - /* - * Only probe on the hash list. Disarm only if kprobes are - * enabled - otherwise, the breakpoint would already have - * been removed. We save on flushing icache. 
- */ - if (kprobe_enabled) - arch_disarm_kprobe(p); - hlist_del_rcu(&old_p->hlist); - } else { - if (p->break_handler) - old_p->break_handler = NULL; - if (p->post_handler) { - list_for_each_entry_rcu(list_p, &old_p->list, list) { - if ((list_p != p) && (list_p->post_handler)) - goto noclean; - } - old_p->post_handler = NULL; - } -noclean: - list_del_rcu(&p->list); - } - return 0; -} - -static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) -{ - struct module *mod; - struct kprobe *old_p; - - if (p->mod_refcounted) { - mod = module_text_address((unsigned long)p->addr); - if (mod) - module_put(mod); - } - - if (list_empty(&p->list) || list_is_singular(&p->list)) { - if (!list_empty(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - kfree(old_p); - } - arch_remove_kprobe(p); - } -} - -static int __register_kprobes(struct kprobe **kps, int num, - unsigned long called_from) -{ - int i, ret = 0; - - if (num <= 0) - return -EINVAL; - for (i = 0; i < num; i++) { - ret = __register_kprobe(kps[i], called_from); - if (ret < 0) { - if (i > 0) - unregister_kprobes(kps, i); - break; - } - } - return ret; -} - -/* - * Registration and unregistration functions for kprobe. 
- */ -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobes(&p, 1, - (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kprobe(struct kprobe *p) -{ - unregister_kprobes(&p, 1); -} - -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - return __register_kprobes(kps, num, - (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kprobes(struct kprobe **kps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(kps[i]) < 0) - kps[i]->addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) - if (kps[i]->addr) - __unregister_kprobe_bottom(kps[i]); -} - -static struct notifier_block kprobe_exceptions_nb = { - .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to be notified first */ -}; - -unsigned long __weak arch_deref_entry_point(void *entry) -{ - return (unsigned long)entry; -} - -static int __register_jprobes(struct jprobe **jps, int num, - unsigned long called_from) -{ - struct jprobe *jp; - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - for (i = 0; i < num; i++) { - unsigned long addr; - jp = jps[i]; - addr = arch_deref_entry_point(jp->entry); - - if (!kernel_text_address(addr)) - ret = -EINVAL; - else { - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = __register_kprobe(&jp->kp, called_from); - } - if (ret < 0) { - if (i > 0) - unregister_jprobes(jps, i); - break; - } - } - return ret; -} - -int __kprobes register_jprobe(struct jprobe *jp) -{ - return __register_jprobes(&jp, 1, - (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_jprobe(struct jprobe *jp) -{ - unregister_jprobes(&jp, 1); -} - -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - return __register_jprobes(jps, num, 
- (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_jprobes(struct jprobe **jps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&jps[i]->kp) < 0) - jps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (jps[i]->kp.addr) - __unregister_kprobe_bottom(&jps[i]->kp); - } -} - -#ifdef CONFIG_KRETPROBES -/* - * This kprobe pre_handler is registered with every kretprobe. When probe - * hits it will set up the return probe. - */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) -{ - struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long hash, flags = 0; - struct kretprobe_instance *ri; - - /*TODO: consider to only swap the RA after the last pre_handler fired */ - hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); - if (!hlist_empty(&rp->free_instances)) { - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, hlist); - hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); - - ri->rp = rp; - ri->task = current; - - if (rp->entry_handler && rp->entry_handler(ri, regs)) - return 0; - - arch_prepare_kretprobe(ri, regs); - - /* XXX(hch): why is there no hlist_move_head? 
*/ - INIT_HLIST_NODE(&ri->hlist); - kretprobe_table_lock(hash, &flags); - hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); - kretprobe_table_unlock(hash, &flags); - } else { - rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); - } - return 0; -} - -static int __kprobes __register_kretprobe(struct kretprobe *rp, - unsigned long called_from) -{ - int ret = 0; - struct kretprobe_instance *inst; - int i; - void *addr; - - if (kretprobe_blacklist_size) { - addr = kprobe_addr(&rp->kp); - if (!addr) - return -EINVAL; - - for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - if (kretprobe_blacklist[i].addr == addr) - return -EINVAL; - } - } - - rp->kp.pre_handler = pre_handler_kretprobe; - rp->kp.post_handler = NULL; - rp->kp.fault_handler = NULL; - rp->kp.break_handler = NULL; - - /* Pre-allocate memory for max kretprobe instances */ - if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * NR_CPUS); -#else - rp->maxactive = NR_CPUS; -#endif - } - spin_lock_init(&rp->lock); - INIT_HLIST_HEAD(&rp->free_instances); - for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance) + - rp->data_size, GFP_KERNEL); - if (inst == NULL) { - free_rp_inst(rp); - return -ENOMEM; - } - INIT_HLIST_NODE(&inst->hlist); - hlist_add_head(&inst->hlist, &rp->free_instances); - } - - rp->nmissed = 0; - /* Establish function entry probe point */ - ret = __register_kprobe(&rp->kp, called_from); - if (ret != 0) - free_rp_inst(rp); - return ret; -} - -static int __register_kretprobes(struct kretprobe **rps, int num, - unsigned long called_from) -{ - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - for (i = 0; i < num; i++) { - ret = __register_kretprobe(rps[i], called_from); - if (ret < 0) { - if (i > 0) - unregister_kretprobes(rps, i); - break; - } - } - return ret; -} - -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return __register_kretprobes(&rp, 1, - (unsigned long)__builtin_return_address(0)); -} - 
-void __kprobes unregister_kretprobe(struct kretprobe *rp) -{ - unregister_kretprobes(&rp, 1); -} - -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return __register_kretprobes(rps, num, - (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&rps[i]->kp) < 0) - rps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (rps[i]->kp.addr) { - __unregister_kprobe_bottom(&rps[i]->kp); - cleanup_rp_inst(rps[i]); - } - } -} - -#else /* CONFIG_KRETPROBES */ -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return -ENOSYS; -} - -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return -ENOSYS; -} -void __kprobes unregister_kretprobe(struct kretprobe *rp) -{ -} - -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) -{ -} - -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) -{ - return 0; -} - -#endif /* CONFIG_KRETPROBES */ - -static int __init init_kprobes(void) -{ - int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[128]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; - - /* FIXME allocate the probe table, currently defined statically */ - /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - INIT_HLIST_HEAD(&kprobe_table[i]); - INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); - } - - /* - * Lookup and populate the kprobe_blacklist. - * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. 
- */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; - } - - if (kretprobe_blacklist_size) { - /* lookup the function address from its name */ - for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - kprobe_lookup_name(kretprobe_blacklist[i].name, - kretprobe_blacklist[i].addr); - if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", - kretprobe_blacklist[i].name); - } - } - - /* By default, kprobes are enabled */ - kprobe_enabled = true; - - err = arch_init_kprobes(); - if (!err) - err = register_die_notifier(&kprobe_exceptions_nb); - kprobes_initialized = (err == 0); - - if (!err) - init_test_probes(); - return err; -} - -#ifdef CONFIG_DEBUG_FS -static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) -{ - char *kprobe_type; - - if (p->pre_handler == pre_handler_kretprobe) - kprobe_type = "r"; - else if (p->pre_handler == setjmp_pre_handler) - kprobe_type = "j"; - else - kprobe_type = "k"; - if (sym) - seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " ")); - else - seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); -} - -static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) -{ - return (*pos < KPROBE_TABLE_SIZE) ? 
pos : NULL; -} - -static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) -{ - (*pos)++; - if (*pos >= KPROBE_TABLE_SIZE) - return NULL; - return pos; -} - -static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) -{ - /* Nothing to do */ -} - -static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p, *kp; - const char *sym = NULL; - unsigned int i = *(loff_t *) v; - unsigned long offset = 0; - char *modname, namebuf[128]; - - head = &kprobe_table[i]; - preempt_disable(); - hlist_for_each_entry_rcu(p, node, head, hlist) { - sym = kallsyms_lookup((unsigned long)p->addr, NULL, - &offset, &modname, namebuf); - if (p->pre_handler == aggr_pre_handler) { - list_for_each_entry_rcu(kp, &p->list, list) - report_probe(pi, kp, sym, offset, modname); - } else - report_probe(pi, p, sym, offset, modname); - } - preempt_enable(); - return 0; -} - -static struct seq_operations kprobes_seq_ops = { - .start = kprobe_seq_start, - .next = kprobe_seq_next, - .stop = kprobe_seq_stop, - .show = show_kprobe_addr -}; - -static int __kprobes kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void __kprobes enable_all_kprobes(void) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - - mutex_lock(&kprobe_mutex); - - /* If kprobes are already enabled, just return */ - if (kprobe_enabled) - goto already_enabled; - - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) - arch_arm_kprobe(p); - } - - kprobe_enabled = true; - printk(KERN_INFO "Kprobes globally enabled\n"); - -already_enabled: - mutex_unlock(&kprobe_mutex); - return; -} - -static void 
__kprobes disable_all_kprobes(void) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - - mutex_lock(&kprobe_mutex); - - /* If kprobes are already disabled, just return */ - if (!kprobe_enabled) - goto already_disabled; - - kprobe_enabled = false; - printk(KERN_INFO "Kprobes globally disabled\n"); - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p)) - arch_disarm_kprobe(p); - } - } - - mutex_unlock(&kprobe_mutex); - /* Allow all currently running kprobes to complete */ - synchronize_sched(); - return; - -already_disabled: - mutex_unlock(&kprobe_mutex); - return; -} - -/* - * XXX: The debugfs bool file interface doesn't allow for callbacks - * when the bool state is switched. We can reuse that facility when - * available - */ -static ssize_t read_enabled_file_bool(struct file *file, - char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[3]; - - if (kprobe_enabled) - buf[0] = '1'; - else - buf[0] = '0'; - buf[1] = '\n'; - buf[2] = 0x00; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t write_enabled_file_bool(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - int buf_size; - - buf_size = min(count, (sizeof(buf)-1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - enable_all_kprobes(); - break; - case 'n': - case 'N': - case '0': - disable_all_kprobes(); - break; - } - - return count; -} - -static struct file_operations fops_kp = { - .read = read_enabled_file_bool, - .write = write_enabled_file_bool, -}; - -static int __kprobes debugfs_kprobe_init(void) -{ - struct dentry *dir, *file; - unsigned int value = 1; - - dir = debugfs_create_dir("kprobes", NULL); - if (!dir) - return -ENOMEM; - - file = debugfs_create_file("list", 0444, dir, NULL, - 
&debugfs_kprobes_operations); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } - - file = debugfs_create_file("enabled", 0600, dir, - &value, &fops_kp); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } - - return 0; -} - -late_initcall(debugfs_kprobe_init); -#endif /* CONFIG_DEBUG_FS */ - -module_init(init_kprobes); - -EXPORT_SYMBOL_GPL(register_kprobe); -EXPORT_SYMBOL_GPL(unregister_kprobe); -EXPORT_SYMBOL_GPL(register_kprobes); -EXPORT_SYMBOL_GPL(unregister_kprobes); -EXPORT_SYMBOL_GPL(register_jprobe); -EXPORT_SYMBOL_GPL(unregister_jprobe); -EXPORT_SYMBOL_GPL(register_jprobes); -EXPORT_SYMBOL_GPL(unregister_jprobes); -EXPORT_SYMBOL_GPL(jprobe_return); -EXPORT_SYMBOL_GPL(register_kretprobe); -EXPORT_SYMBOL_GPL(unregister_kretprobe); -EXPORT_SYMBOL_GPL(register_kretprobes); -EXPORT_SYMBOL_GPL(unregister_kretprobes); -/* - * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which - * are not related to any other subsystem - * - * Copyright (C) 2004 Kay Sievers - * - * This file is release under the GPLv2 - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#define KERNEL_ATTR_RO(_name) \ -static struct kobj_attribute _name##_attr = __ATTR_RO(_name) - -#define KERNEL_ATTR_RW(_name) \ -static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) - -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) -/* current uevent sequence number */ -static ssize_t uevent_seqnum_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); -} -KERNEL_ATTR_RO(uevent_seqnum); - -/* uevent helper program, used during early boo */ -static ssize_t uevent_helper_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", uevent_helper); -} -static ssize_t uevent_helper_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - if (count+1 > 
UEVENT_HELPER_PATH_LEN) - return -ENOENT; - memcpy(uevent_helper, buf, count); - uevent_helper[count] = '\0'; - if (count && uevent_helper[count-1] == '\n') - uevent_helper[count-1] = '\0'; - return count; -} -KERNEL_ATTR_RW(uevent_helper); -#endif - -#ifdef CONFIG_KEXEC -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", !!kexec_crash_image); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -static ssize_t vmcoreinfo_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lx %x\n", - paddr_vmcoreinfo_note(), - (unsigned int)vmcoreinfo_max_size); -} -KERNEL_ATTR_RO(vmcoreinfo); - -#endif /* CONFIG_KEXEC */ - -/* - * Make /sys/kernel/notes give the raw contents of our kernel .notes section. - */ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); -#define notes_size (&__stop_notes - &__start_notes) - -static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) -{ - memcpy(buf, &__start_notes + off, count); - return count; -} - -static struct bin_attribute notes_attr = { - .attr = { - .name = "notes", - .mode = S_IRUGO, - }, - .read = ¬es_read, -}; - -struct kobject *kernel_kobj; -EXPORT_SYMBOL_GPL(kernel_kobj); - -static struct attribute * kernel_attrs[] = { -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) - &uevent_seqnum_attr.attr, - &uevent_helper_attr.attr, -#endif -#ifdef CONFIG_KEXEC - &kexec_loaded_attr.attr, - &kexec_crash_loaded_attr.attr, - &vmcoreinfo_attr.attr, -#endif - NULL -}; - -static struct attribute_group kernel_attr_group = { - .attrs = kernel_attrs, -}; - -static int __init ksysfs_init(void) -{ - int error; - - kernel_kobj = 
kobject_create_and_add("kernel", NULL); - if (!kernel_kobj) { - error = -ENOMEM; - goto exit; - } - error = sysfs_create_group(kernel_kobj, &kernel_attr_group); - if (error) - goto kset_exit; - - if (notes_size > 0) { - notes_attr.size = notes_size; - error = sysfs_create_bin_file(kernel_kobj, ¬es_attr); - if (error) - goto group_exit; - } - - /* create the /sys/kernel/uids/ directory */ - error = uids_sysfs_init(); - if (error) - goto notes_exit; - - return 0; - -notes_exit: - if (notes_size > 0) - sysfs_remove_bin_file(kernel_kobj, ¬es_attr); -group_exit: - sysfs_remove_group(kernel_kobj, &kernel_attr_group); -kset_exit: - kobject_put(kernel_kobj); -exit: - return error; -} - -core_initcall(ksysfs_init); -/* Kernel thread helper functions. - * Copyright (C) 2004 IBM Corporation, Rusty Russell. - * - * Creation is done via kthreadd, so that we get a clean environment - * even if we're invoked from userspace (think modprobe, hotplug cpu, - * etc.). - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#define KTHREAD_NICE_LEVEL (-5) - -static DEFINE_SPINLOCK(kthread_create_lock); -static LIST_HEAD(kthread_create_list); -struct task_struct *kthreadd_task; - -struct kthread_create_info -{ - /* Information passed to kthread() from kthreadd. */ - int (*threadfn)(void *data); - void *data; - struct completion started; - - /* Result passed back to kthread_create() from kthreadd. */ - struct task_struct *result; - struct completion done; - - struct list_head list; -}; - -struct kthread_stop_info -{ - struct task_struct *k; - int err; - struct completion done; -}; - -/* Thread stopping is done by setthing this var: lock serializes - * multiple kthread_stop calls. */ -static DEFINE_MUTEX(kthread_stop_lock); -static struct kthread_stop_info kthread_stop_info; - -/** - * kthread_should_stop - should this kthread return now? - * - * When someone calls kthread_stop() on your kthread, it will be woken - * and this will return true. 
You should then return, and your return - * value will be passed through to kthread_stop(). - */ -int kthread_should_stop(void) -{ - return (kthread_stop_info.k == current); -} -EXPORT_SYMBOL(kthread_should_stop); - -static int kthread(void *_create) -{ - struct kthread_create_info *create = _create; - int (*threadfn)(void *data); - void *data; - int ret = -EINTR; - - /* Copy data: it's on kthread's stack */ - threadfn = create->threadfn; - data = create->data; - - /* OK, tell user we're spawned, wait for stop or wakeup */ - __set_current_state(TASK_UNINTERRUPTIBLE); - complete(&create->started); - schedule(); - - if (!kthread_should_stop()) - ret = threadfn(data); - - /* It might have exited on its own, w/o kthread_stop. Check. */ - if (kthread_should_stop()) { - kthread_stop_info.err = ret; - complete(&kthread_stop_info.done); - } - return 0; -} - -static void create_kthread(struct kthread_create_info *create) -{ - int pid; - - /* We want our own signal handler (we take no signals by default). */ - pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); - if (pid < 0) { - create->result = ERR_PTR(pid); - } else { - struct sched_param param = { .sched_priority = 0 }; - wait_for_completion(&create->started); - read_lock(&tasklist_lock); - create->result = find_task_by_pid_ns(pid, &init_pid_ns); - read_unlock(&tasklist_lock); - /* - * root may have changed our (kthreadd's) priority or CPU mask. - * The kernel thread should not inherit these properties. - */ - sched_setscheduler(create->result, SCHED_NORMAL, ¶m); - set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); - } - complete(&create->done); -} - -/** - * kthread_create - create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. 
The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). - * - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM). - */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) -{ - struct kthread_create_info create; - - create.threadfn = threadfn; - create.data = data; - init_completion(&create.started); - init_completion(&create.done); - - spin_lock(&kthread_create_lock); - list_add_tail(&create.list, &kthread_create_list); - spin_unlock(&kthread_create_lock); - - wake_up_process(kthreadd_task); - wait_for_completion(&create.done); - - if (!IS_ERR(create.result)) { - va_list args; - va_start(args, namefmt); - vsnprintf(create.result->comm, sizeof(create.result->comm), - namefmt, args); - va_end(args); - } - return create.result; -} -EXPORT_SYMBOL(kthread_create); - -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). 
- */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - if (k->state != TASK_UNINTERRUPTIBLE) { - WARN_ON(1); - return; - } - /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k, 0); - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** - * kthread_stop - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. - */ -int kthread_stop(struct task_struct *k) -{ - int ret; - - mutex_lock(&kthread_stop_lock); - - /* It could exit after stop_info.k set, but before wake_up_process. */ - get_task_struct(k); - - /* Must init completion *before* thread sees kthread_stop_info.k */ - init_completion(&kthread_stop_info.done); - smp_wmb(); - - /* Now set kthread_should_stop() to true, and wake it up. */ - kthread_stop_info.k = k; - wake_up_process(k); - - /* Once it dies, reset stop ptr, gather result and we're done. */ - wait_for_completion(&kthread_stop_info.done); - kthread_stop_info.k = NULL; - ret = kthread_stop_info.err; - put_task_struct(k); - mutex_unlock(&kthread_stop_lock); - - return ret; -} -EXPORT_SYMBOL(kthread_stop); - -int kthreadd(void *unused) -{ - struct task_struct *tsk = current; - - /* Setup a clean context for our children to inherit. 
*/ - set_task_comm(tsk, "kthreadd"); - ignore_signals(tsk); - set_user_nice(tsk, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); - - current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty(&kthread_create_list)) - schedule(); - __set_current_state(TASK_RUNNING); - - spin_lock(&kthread_create_lock); - while (!list_empty(&kthread_create_list)) { - struct kthread_create_info *create; - - create = list_entry(kthread_create_list.next, - struct kthread_create_info, list); - list_del_init(&create->list); - spin_unlock(&kthread_create_lock); - - create_kthread(create); - - spin_lock(&kthread_create_lock); - } - spin_unlock(&kthread_create_lock); - } - - return 0; -} -/* - * latencytop.c: Latency display infrastructure - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(latency_lock); - -#define MAXLR 128 -static struct latency_record latency_record[MAXLR]; - -int latencytop_enabled; - -void clear_all_latency_tracing(struct task_struct *p) -{ - unsigned long flags; - - if (!latencytop_enabled) - return; - - spin_lock_irqsave(&latency_lock, flags); - memset(&p->latency_record, 0, sizeof(p->latency_record)); - p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); -} - -static void clear_global_latency_tracing(void) -{ - unsigned long flags; - - spin_lock_irqsave(&latency_lock, flags); - memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); -} - -static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) -{ - int firstnonnull = MAXLR + 1; - int i; - - if (!latencytop_enabled) - return; - - /* skip kernel threads for now */ - if (!tsk->mm) - return; - - for (i = 0; i < MAXLR; i++) { - int q, same = 1; - - /* Nothing stored: */ - if (!latency_record[i].backtrace[0]) { - if (firstnonnull > i) - firstnonnull = i; - continue; - } - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - unsigned long record = lat->backtrace[q]; - - if (latency_record[i].backtrace[q] != record) { - same = 0; - break; - } - - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) - break; - } - if (same) { - latency_record[i].count++; - latency_record[i].time += lat->time; - if (lat->time > latency_record[i].max) - latency_record[i].max = lat->time; - return; - } - } - - i = firstnonnull; - if (i >= MAXLR - 1) - return; - - /* Allocted a new one: */ - memcpy(&latency_record[i], lat, sizeof(struct latency_record)); -} - -static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) -{ - struct stack_trace trace; - - memset(&trace, 0, sizeof(trace)); 
- trace.max_entries = LT_BACKTRACEDEPTH; - trace.entries = &lat->backtrace[0]; - trace.skip = 0; - save_stack_trace_tsk(tsk, &trace); -} - -void __sched -account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) -{ - unsigned long flags; - int i, q; - struct latency_record lat; - - if (!latencytop_enabled) - return; - - /* Long interruptible waits are generally user requested... */ - if (inter && usecs > 5000) - return; - - memset(&lat, 0, sizeof(lat)); - lat.count = 1; - lat.time = usecs; - lat.max = usecs; - store_stacktrace(tsk, &lat); - - spin_lock_irqsave(&latency_lock, flags); - - account_global_scheduler_latency(tsk, &lat); - - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT ; i++) { - struct latency_record *mylat; - int same = 1; - - mylat = &tsk->latency_record[i]; - for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - unsigned long record = lat.backtrace[q]; - - if (mylat->backtrace[q] != record) { - same = 0; - break; - } - - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) - break; - } - if (same) { - mylat->count++; - mylat->time += lat.time; - if (lat.time > mylat->max) - mylat->max = lat.time; - goto out_unlock; - } - } - - /* Allocated a new one: */ - i = tsk->latency_record_count; - memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); - -out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); -} - -static int lstats_show(struct seq_file *m, void *v) -{ - int i; - - seq_puts(m, "Latency Top version : v0.1\n"); - - for (i = 0; i < MAXLR; i++) { - if (latency_record[i].backtrace[0]) { - int q; - seq_printf(m, "%i %li %li ", - latency_record[i].count, - latency_record[i].time, - latency_record[i].max); - for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_NAME_LEN]; - char *c; - if (!latency_record[i].backtrace[q]) - 
break; - if (latency_record[i].backtrace[q] == ULONG_MAX) - break; - sprint_symbol(sym, latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); - } - seq_printf(m, "\n"); - } - } - return 0; -} - -static ssize_t -lstats_write(struct file *file, const char __user *buf, size_t count, - loff_t *offs) -{ - clear_global_latency_tracing(); - - return count; -} - -static int lstats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, lstats_show, NULL); -} - -static struct file_operations lstats_fops = { - .open = lstats_open, - .read = seq_read, - .write = lstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init init_lstats_procfs(void) -{ - proc_create("latency_stats", 0644, NULL, &lstats_fops); - return 0; -} -__initcall(init_lstats_procfs); -/* - * kernel/lockdep.c - * - * Runtime locking correctness validator - * - * Started by Ingo Molnar: - * - * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * this code maps all the lock dependencies as they occur in a live kernel - * and will warn about the following classes of locking bugs: - * - * - lock inversion scenarios - * - circular lock dependencies - * - hardirq/softirq safe/unsafe locking bugs - * - * Bugs are reported even if the current locking scenario does not cause - * any deadlock at this point. - * - * I.e. if anytime in the past two locks were taken in a different order, - * even if it happened for another task, even if those were different - * locks (but of the same class as this lock), this code will detect it. - * - * Thanks to Arjan van de Ven for coming up with the initial idea of - * mapping lock dependencies runtime. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "lockdep_internals.h" - -#ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; -module_param(prove_locking, int, 0644); -#else -#define prove_locking 0 -#endif - -#ifdef CONFIG_LOCK_STAT -int lock_stat = 1; -module_param(lock_stat, int, 0644); -#else -#define lock_stat 0 -#endif - -/* - * lockdep_lock: protects the lockdep graph, the hashes and the - * class/list/hash allocators. - * - * This is one of the rare exceptions where it's justified - * to use a raw spinlock - we really dont want the spinlock - * code to recurse back into the lockdep code... - */ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -static int graph_lock(void) -{ - __raw_spin_lock(&lockdep_lock); - /* - * Make sure that if another CPU detected a bug while - * walking the graph we dont change it (while the other - * CPU is busy printing out stuff with the graph lock - * dropped already) - */ - if (!debug_locks) { - __raw_spin_unlock(&lockdep_lock); - return 0; - } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; - return 1; -} - -static inline int graph_unlock(void) -{ - if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) - return DEBUG_LOCKS_WARN_ON(1); - - current->lockdep_recursion--; - __raw_spin_unlock(&lockdep_lock); - return 0; -} - -/* - * Turn lock debugging off and return with 0 if it was off already, - * and also release the graph lock: - */ -static inline int debug_locks_off_graph_unlock(void) -{ - int ret = debug_locks_off(); - - __raw_spin_unlock(&lockdep_lock); - - return ret; -} - -static int lockdep_initialized; - -unsigned long nr_list_entries; -static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; - -/* - * All data structures here are protected by the global debug_lock. 
- * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. - */ -unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; - -static inline struct lock_class *hlock_class(struct held_lock *hlock) -{ - if (!hlock->class_idx) { - DEBUG_LOCKS_WARN_ON(1); - return NULL; - } - return lock_classes + hlock->class_idx - 1; -} - -#ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); - -static int lock_contention_point(struct lock_class *class, unsigned long ip) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { - if (class->contention_point[i] == 0) { - class->contention_point[i] = ip; - break; - } - if (class->contention_point[i] == ip) - break; - } - - return i; -} - -static void lock_time_inc(struct lock_time *lt, s64 time) -{ - if (time > lt->max) - lt->max = time; - - if (time < lt->min || !lt->min) - lt->min = time; - - lt->total += time; - lt->nr++; -} - -static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) -{ - dst->min += src->min; - dst->max += src->max; - dst->total += src->total; - dst->nr += src->nr; -} - -struct lock_class_stats lock_stats(struct lock_class *class) -{ - struct lock_class_stats stats; - int cpu, i; - - memset(&stats, 0, sizeof(struct lock_class_stats)); - for_each_possible_cpu(cpu) { - struct lock_class_stats *pcs = - &per_cpu(lock_stats, cpu)[class - lock_classes]; - - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; - - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); - - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); - - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; - } - - return 
stats; -} - -void clear_lock_stats(struct lock_class *class) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct lock_class_stats *cpu_stats = - &per_cpu(lock_stats, cpu)[class - lock_classes]; - - memset(cpu_stats, 0, sizeof(struct lock_class_stats)); - } - memset(class->contention_point, 0, sizeof(class->contention_point)); -} - -static struct lock_class_stats *get_lock_stats(struct lock_class *class) -{ - return &get_cpu_var(lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(lock_stats); -} - -static void lock_release_holdtime(struct held_lock *hlock) -{ - struct lock_class_stats *stats; - s64 holdtime; - - if (!lock_stat) - return; - - holdtime = sched_clock() - hlock->holdtime_stamp; - - stats = get_lock_stats(hlock_class(hlock)); - if (hlock->read) - lock_time_inc(&stats->read_holdtime, holdtime); - else - lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); -} -#else -static inline void lock_release_holdtime(struct held_lock *hlock) -{ -} -#endif - -/* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. 
- */ -LIST_HEAD(all_lock_classes); - -/* - * The lockdep classes are in a hash-table as well, for fast lookup: - */ -#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) -#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) -#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) -#define classhashentry(key) (classhash_table + __classhashfn((key))) - -static struct list_head classhash_table[CLASSHASH_SIZE]; - -/* - * We put the lock dependency chains into a hash-table as well, to cache - * their existence: - */ -#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) -#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) -#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) -#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) - -static struct list_head chainhash_table[CHAINHASH_SIZE]; - -/* - * The hash key of the lock dependency chains is a hash itself too: - * it's a hash of all locks taken up to that lock, including that lock. - * It's a 64-bit hash, because it's important for the keys to be - * unique. - */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) - -void lockdep_off(void) -{ - current->lockdep_recursion++; -} - -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion--; -} - -EXPORT_SYMBOL(lockdep_on); - -/* - * Debugging switches: - */ - -#define VERBOSE 0 -#define VERY_VERBOSE 0 - -#if VERBOSE -# define HARDIRQ_VERBOSE 1 -# define SOFTIRQ_VERBOSE 1 -#else -# define HARDIRQ_VERBOSE 0 -# define SOFTIRQ_VERBOSE 0 -#endif - -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE -/* - * Quick filtering for interesting events: - */ -static int class_filter(struct lock_class *class) -{ -#if 0 - /* Example */ - if (class->name_version == 1 && - !strcmp(class->name, "lockname")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "&struct->lockfield")) - return 1; -#endif - /* Filter everything else. 
1 would be to allow everything else */ - return 0; -} -#endif - -static int verbose(struct lock_class *class) -{ -#if VERBOSE - return class_filter(class); -#endif - return 0; -} - -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - -static int save_trace(struct stack_trace *trace) -{ - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - trace->max_entries = trace->nr_entries; - - nr_stack_trace_entries += trace->nr_entries; - - if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return 0; - - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - - return 0; - } - - return 1; -} - -unsigned int nr_hardirq_chains; -unsigned int nr_softirq_chains; -unsigned int nr_process_chains; -unsigned int max_lockdep_depth; -unsigned int max_recursion_depth; - -static unsigned int lockdep_dependency_gen_id; - -static bool lockdep_dependency_visit(struct lock_class *source, - unsigned int depth) -{ - if (!depth) - lockdep_dependency_gen_id++; - if (source->dep_gen_id == lockdep_dependency_gen_id) - return true; - source->dep_gen_id = lockdep_dependency_gen_id; - return false; -} - -#ifdef CONFIG_DEBUG_LOCKDEP -/* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). 
- */ -static int lockdep_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* - * Various lockdep statistics: - */ -atomic_t chain_lookup_hits; -atomic_t chain_lookup_misses; -atomic_t hardirqs_on_events; -atomic_t hardirqs_off_events; -atomic_t redundant_hardirqs_on; -atomic_t redundant_hardirqs_off; -atomic_t softirqs_on_events; -atomic_t softirqs_off_events; -atomic_t redundant_softirqs_on; -atomic_t redundant_softirqs_off; -atomic_t nr_unused_locks; -atomic_t nr_cyclic_checks; -atomic_t nr_cyclic_check_recursions; -atomic_t nr_find_usage_forwards_checks; -atomic_t nr_find_usage_forwards_recursions; -atomic_t nr_find_usage_backwards_checks; -atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) -#else -# define debug_atomic_inc(ptr) do { } while (0) -# define debug_atomic_dec(ptr) do { } while (0) -# define debug_atomic_read(ptr) 0 -#endif - -/* - * Locking printouts: - */ - -static const char *usage_str[] = -{ - [LOCK_USED] = "initial-use ", - [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", - [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", - [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", - [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", - [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", - [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", - [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", - [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", -}; - -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) -{ - return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); -} - -void -get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) -{ - *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; - - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) - *c1 = '+'; - else - if 
(class->usage_mask & LOCKF_ENABLED_HARDIRQS) - *c1 = '-'; - - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) - *c2 = '+'; - else - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) - *c2 = '-'; - - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) - *c3 = '-'; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { - *c3 = '+'; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) - *c3 = '?'; - } - - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) - *c4 = '-'; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { - *c4 = '+'; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) - *c4 = '?'; - } -} - -static void print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN], c1, c2, c3, c4; - const char *name; - - get_usage_chars(class, &c1, &c2, &c3, &c4); - - name = class->name; - if (!name) { - name = __get_key_name(class->key, str); - printk(" (%s", name); - } else { - printk(" (%s", name); - if (class->name_version > 1) - printk("#%d", class->name_version); - if (class->subclass) - printk("/%d", class->subclass); - } - printk("){%c%c%c%c}", c1, c2, c3, c4); -} - -static void print_lockdep_cache(struct lockdep_map *lock) -{ - const char *name; - char str[KSYM_NAME_LEN]; - - name = lock->name; - if (!name) - name = __get_key_name(lock->key->subkeys, str); - - printk("%s", name); -} - -static void print_lock(struct held_lock *hlock) -{ - print_lock_name(hlock_class(hlock)); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); -} - -static void lockdep_print_held_locks(struct task_struct *curr) -{ - int i, depth = curr->lockdep_depth; - - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); - return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? 
"s" : "", curr->comm, task_pid_nr(curr)); - - for (i = 0; i < depth; i++) { - printk(" #%d: ", i); - print_lock(curr->held_locks + i); - } -} - -static void print_lock_class_header(struct lock_class *class, int depth) -{ - int bit; - - printk("%*s->", depth, ""); - print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); - - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { - if (class->usage_mask & (1 << bit)) { - int len = depth; - - len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); - print_stack_trace(class->usage_traces + bit, len); - } - } - printk("%*s }\n", depth, ""); - - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); -} - -/* - * printk all lock dependencies starting at : - */ -static void print_lock_dependencies(struct lock_class *class, int depth) -{ - struct lock_list *entry; - - if (lockdep_dependency_visit(class, depth)) - return; - - if (DEBUG_LOCKS_WARN_ON(depth >= 20)) - return; - - print_lock_class_header(class, depth); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (DEBUG_LOCKS_WARN_ON(!entry->class)) - return; - - print_lock_dependencies(entry->class, depth + 1); - - printk("%*s ... acquired at:\n",depth,""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - } -} - -static void print_kernel_version(void) -{ - printk("%s %.*s\n", init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); -} - -static int very_verbose(struct lock_class *class) -{ -#if VERY_VERBOSE - return class_filter(class); -#endif - return 0; -} - -/* - * Is this the address of a static object: - */ -static int static_obj(void *obj) -{ - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif - - /* - * static variable? - */ - if ((addr >= start) && (addr < end)) - return 1; - -#ifdef CONFIG_SMP - /* - * percpu var? 
- */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif - - /* - * module var? - */ - return is_module_address(addr); -} - -/* - * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: - */ -static int count_matching_names(struct lock_class *new_class) -{ - struct lock_class *class; - int count = 0; - - if (!new_class->name) - return 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (new_class->key - new_class->subclass == class->key) - return class->name_version; - if (class->name && !strcmp(class->name, new_class->name)) - count = max(count, class->name_version); - } - - return count + 1; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - save_stack_trace(&lockdep_init_trace); - } -#endif - - /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: - */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; - - /* - * NOTE: the class-key must be unique. For dynamic locks, a static - * lock_class_key variable is passed in through the mutex_init() - * (or spin_lock_init()) call - which acts as the key. 
For static - * locks we use the lock object itself as the key. - */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > - sizeof(struct lockdep_map)); - - key = lock->key->subkeys + subclass; - - hash_head = classhashentry(key); - - /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: - */ - list_for_each_entry(class, hash_head, hash_entry) { - if (class->key == key) { - WARN_ON_ONCE(class->name != lock->name); - return class; - } - } - - return NULL; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - unsigned long flags; - - class = look_up_lock_class(lock, subclass); - if (likely(class)) - return class; - - /* - * Debug-check: all keys must be persistent! 
- */ - if (!static_obj(lock->key)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - - return NULL; - } - - key = lock->key->subkeys + subclass; - hash_head = classhashentry(key); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - /* - * We have to do the hash-walk again, to avoid races - * with another CPU: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - goto out_unlock_set; - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); - return NULL; - } - raw_local_irq_restore(flags); - - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); - class->key = key; - class->name = lock->name; - class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); - class->name_version = count_matching_names(class); - /* - * We use RCU's safe list-add method to make - * parallel walking of the hash-list safe: - */ - list_add_tail_rcu(&class->hash_entry, hash_head); - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&class->lock_entry, &all_lock_classes); - - if (verbose(class)) { - graph_unlock(); - raw_local_irq_restore(flags); - - printk("\nnew class %p: %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - } -out_unlock_set: - graph_unlock(); - 
raw_local_irq_restore(flags); - - if (!subclass || force) - lock->class_cache = class; - - if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) - return NULL; - - return class; -} - -#ifdef CONFIG_PROVE_LOCKING -/* - * Allocate a lockdep entry. (assumes the graph_lock held, returns - * with NULL on failure) - */ -static struct lock_list *alloc_list_entry(void) -{ - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return NULL; - - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - return list_entries + nr_list_entries++; -} - -/* - * Add a new dependency to the head of the list: - */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, int distance) -{ - struct lock_list *entry; - /* - * Lock not present yet - get a new dependency struct and - * add it to the list: - */ - entry = alloc_list_entry(); - if (!entry) - return 0; - - if (!save_trace(&entry->trace)) - return 0; - - entry->class = this; - entry->distance = distance; - /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: - */ - list_add_tail_rcu(&entry->entry, head); - - return 1; -} - -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - * - * (to keep the stackframe of the recursive functions small we - * use these global variables, and we also mark various helper - * functions as noinline.) 
- */ -static struct held_lock *check_source, *check_target; - -/* - * Print a dependency chain entry (this is only done when a deadlock - * has been detected): - */ -static noinline int -print_circular_bug_entry(struct lock_list *target, unsigned int depth) -{ - if (debug_locks_silent) - return 0; - printk("\n-> #%u", depth); - print_lock_name(target->class); - printk(":\n"); - print_stack_trace(&target->trace, 6); - - return 0; -} - -/* - * When a circular dependency is detected, print the - * header first: - */ -static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth) -{ - struct task_struct *curr = current; - - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=======================================================\n"); - printk( "[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_version(); - printk( "-------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(check_source); - printk("\nbut task is already holding lock:\n"); - print_lock(check_target); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); - - print_circular_bug_entry(entry, depth); - - return 0; -} - -static noinline int print_circular_bug_tail(void) -{ - struct task_struct *curr = current; - struct lock_list this; - - if (debug_locks_silent) - return 0; - - this.class = hlock_class(check_source); - if (!save_trace(&this.trace)) - return 0; - - print_circular_bug_entry(&this, 0); - - printk("\nother info that might help us debug this:\n\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - -unsigned long 
__lockdep_count_forward_deps(struct lock_class *class, - unsigned int depth) -{ - struct lock_list *entry; - unsigned long ret = 1; - - if (lockdep_dependency_visit(class, depth)) - return 0; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += __lockdep_count_forward_deps(entry->class, depth + 1); - - return ret; -} - -unsigned long lockdep_count_forward_deps(struct lock_class *class) -{ - unsigned long ret, flags; - - local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_forward_deps(class, 0); - __raw_spin_unlock(&lockdep_lock); - local_irq_restore(flags); - - return ret; -} - -unsigned long __lockdep_count_backward_deps(struct lock_class *class, - unsigned int depth) -{ - struct lock_list *entry; - unsigned long ret = 1; - - if (lockdep_dependency_visit(class, depth)) - return 0; - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += __lockdep_count_backward_deps(entry->class, depth + 1); - - return ret; -} - -unsigned long lockdep_count_backward_deps(struct lock_class *class) -{ - unsigned long ret, flags; - - local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_backward_deps(class, 0); - __raw_spin_unlock(&lockdep_lock); - local_irq_restore(flags); - - return ret; -} - -/* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. 
- */ -static noinline int -check_noncircular(struct lock_class *source, unsigned int depth) -{ - struct lock_list *entry; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - debug_atomic_inc(&nr_cyclic_check_recursions); - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == hlock_class(check_target)) - return print_circular_bug_header(entry, depth+1); - debug_atomic_inc(&nr_cyclic_checks); - if (!check_noncircular(entry->class, depth+1)) - return print_circular_bug_entry(entry, depth+1); - } - return 1; -} - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) -/* - * Forwards and backwards subgraph searching, for the purposes of - * proving that two subgraphs can be connected by a new dependency - * without creating any illegal irq-safe -> irq-unsafe lock dependency. - */ -static enum lock_usage_bit find_usage_bit; -static struct lock_class *forwards_match, *backwards_match; - -/* - * Find a node in the forwards-direction dependency sub-graph starting - * at that matches . - * - * Return 2 if such a node exists in the subgraph, and put that node - * into . - * - * Return 1 otherwise and keep unchanged. - * Return 0 on error. 
- */ -static noinline int -find_usage_forwards(struct lock_class *source, unsigned int depth) -{ - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); - - debug_atomic_inc(&nr_find_usage_forwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - forwards_match = source; - return 2; - } - - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - debug_atomic_inc(&nr_find_usage_forwards_recursions); - ret = find_usage_forwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; -} - -/* - * Find a node in the backwards-direction dependency sub-graph starting - * at that matches . - * - * Return 2 if such a node exists in the subgraph, and put that node - * into . - * - * Return 1 otherwise and keep unchanged. - * Return 0 on error. 
- */ -static noinline int -find_usage_backwards(struct lock_class *source, unsigned int depth) -{ - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (!__raw_spin_is_locked(&lockdep_lock)) - return DEBUG_LOCKS_WARN_ON(1); - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); - - debug_atomic_inc(&nr_find_usage_backwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - backwards_match = source; - return 2; - } - - if (!source && debug_locks_off_graph_unlock()) { - WARN_ON(1); - return 0; - } - - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_before, entry) { - debug_atomic_inc(&nr_find_usage_backwards_recursions); - ret = find_usage_backwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; -} - -static int -print_bad_irq_dependency(struct task_struct *curr, - struct held_lock *prev, - struct held_lock *next, - enum lock_usage_bit bit1, - enum lock_usage_bit bit2, - const char *irqclass) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n======================================================\n"); - printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", - irqclass, irqclass); - print_kernel_version(); - printk( "------------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", - curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, - curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, - curr->softirqs_enabled); - print_lock(next); - - printk("\nand this task is already holding:\n"); - print_lock(prev); - printk("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); - printk(" ->"); - print_lock_name(hlock_class(next)); - 
printk("\n"); - - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", - irqclass); - print_lock_name(backwards_match); - printk("\n... which became %s-irq-safe at:\n", irqclass); - - print_stack_trace(backwards_match->usage_traces + bit1, 1); - - printk("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_match); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); - - print_stack_trace(forwards_match->usage_traces + bit2, 1); - - printk("\nother info that might help us debug this:\n\n"); - lockdep_print_held_locks(curr); - - printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); - print_lock_dependencies(backwards_match, 0); - - printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); - print_lock_dependencies(forwards_match, 0); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - - find_usage_bit = bit_backwards; - /* fills in */ - ret = find_usage_backwards(hlock_class(prev), 0); - if (!ret || ret == 1) - return ret; - - find_usage_bit = bit_forwards; - ret = find_usage_forwards(hlock_class(next), 0); - if (!ret || ret == 1) - return ret; - /* ret == 2 */ - return print_bad_irq_dependency(curr, prev, next, - bit_backwards, bit_forwards, irqclass); -} - -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; - - /* - * Prove that the new dependency does not connect a hardirq-safe-read - * 
lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, - LOCK_ENABLED_HARDIRQS, "hard-read")) - return 0; - - /* - * Prove that the new dependency does not connect a softirq-safe - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - /* - * Prove that the new dependency does not connect a softirq-safe-read - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - - return 1; -} - -static void inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - -#else - -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - return 1; -} - -static inline void inc_chains(void) -{ - nr_process_chains++; -} - -#endif - -static int -print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=============================================\n"); - printk( "[ INFO: possible recursive locking detected ]\n"); - print_kernel_version(); - printk( "---------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(next); - printk("\nbut task is already holding lock:\n"); - print_lock(prev); - - printk("\nother info that might help us debug 
this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Check whether we are holding such a class already. - * - * (Note that this has to be done separately, because the graph cannot - * detect such classes of deadlocks.) - * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read - */ -static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) -{ - struct held_lock *prev; - struct held_lock *nest = NULL; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - prev = curr->held_locks + i; - - if (prev->instance == next->nest_lock) - nest = prev; - - if (hlock_class(prev) != hlock_class(next)) - continue; - - /* - * Allow read-after-read recursion of the same - * lock class (i.e. read_lock(lock)+read_lock(lock)): - */ - if ((read == 2) && prev->read) - return 2; - - /* - * We're holding the nest_lock, which serializes this lock's - * nesting behaviour. - */ - if (nest) - return 2; - - return print_deadlock_bug(curr, prev, next); - } - return 1; -} - -/* - * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: - * - * - would the adding of the -> dependency create a - * circular dependency in the graph? [== circular deadlock] - * - * - does the new prev->next dependency connect any hardirq-safe lock - * (in the full backwards-subgraph starting at ) with any - * hardirq-unsafe lock (in the full forwards-subgraph starting at - * )? [== illegal lock inversion with hardirq contexts] - * - * - does the new prev->next dependency connect any softirq-safe lock - * (in the full backwards-subgraph starting at ) with any - * softirq-unsafe lock (in the full forwards-subgraph starting at - * )? [== illegal lock inversion with softirq contexts] - * - * any of these scenarios could lead to a deadlock. 
- * - * Then if all the validations pass, we add the forwards and backwards - * dependency. - */ -static int -check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance) -{ - struct lock_list *entry; - int ret; - - /* - * Prove that the new -> dependency would not - * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at , and - * checking whether we can reach .) - * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: - */ - check_source = next; - check_target = prev; - if (!(check_noncircular(hlock_class(next), 0))) - return print_circular_bug_tail(); - - if (!check_prev_add_irq(curr, prev, next)) - return 0; - - /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; - /* - * Is the -> dependency already present? - * - * (this may occur even though this is a new chain: consider - * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 - * chains - the second one will be new, but L1 already has - * L2 added to its dependency list, due to the first chain.) 
- */ - list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { - if (entry->class == hlock_class(next)) { - if (distance == 1) - entry->distance = 1; - return 2; - } - } - - /* - * Ok, all validations passed, add the new lock - * to the previous lock's dependency list: - */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance); - - if (!ret) - return 0; - - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(next)->locks_before, - next->acquire_ip, distance); - if (!ret) - return 0; - - /* - * Debugging printouts: - */ - if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(hlock_class(prev)); - printk(" => "); - print_lock_name(hlock_class(next)); - printk("\n"); - dump_stack(); - return graph_lock(); - } - return 1; -} - -/* - * Add the dependency to all directly-previous locks that are 'relevant'. - * The ones that are relevant are (in increasing distance from curr): - * all consecutive trylock entries and the final non-trylock entry - or - * the end of this context's lock-chain - whichever comes first. - */ -static int -check_prevs_add(struct task_struct *curr, struct held_lock *next) -{ - int depth = curr->lockdep_depth; - struct held_lock *hlock; - - /* - * Debugging checks. 
- * - * Depth must not be zero for a non-head lock: - */ - if (!depth) - goto out_bug; - /* - * At least two relevant locks must exist for this - * to be a head: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - goto out_bug; - - for (;;) { - int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, distance)) - return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } - depth--; - /* - * End of lock-stack? - */ - if (!depth) - break; - /* - * Stop the search if we cross into another context: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - break; - } - return 1; -out_bug: - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - -unsigned long nr_lock_chains; -struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -int nr_chain_hlocks; -static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; - -struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) -{ - return lock_classes + chain_hlocks[chain->base + i]; -} - -/* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) 
- */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) -{ - struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); - struct lock_chain *chain; - struct held_lock *hlock_curr, *hlock_next; - int i, j, n, cn; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ - list_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(&chain_lookup_hits); - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); - return 0; - } - } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); - /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - if (!graph_lock()) - return 0; - /* - * We have to walk the chain again locked - to avoid duplicates: - */ - list_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { - if (!debug_locks_off_graph_unlock()) - return 0; - - printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); - printk("turning off the locking correctness validator.\n"); - return 0; - } - chain = lock_chains + nr_lock_chains++; - chain->chain_key = chain_key; - chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - hlock_next = hlock; - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock_next->irq_context) - break; - hlock_next = hlock; - } - i++; - chain->depth = curr->lockdep_depth + 1 - i; - cn = nr_chain_hlocks; - while (cn + chain->depth <= 
MAX_LOCKDEP_CHAIN_HLOCKS) { - n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); - if (n == cn) - break; - cn = n; - } - if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = cn; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - } - list_add_tail_rcu(&chain->entry, hash_head); - debug_atomic_inc(&chain_lookup_misses); - inc_chains(); - - return 1; -} - -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) -{ - /* - * Trylock needs to maintain the stack of held locks, but it - * does not add new dependencies, because trylock can be done - * in any order. - * - * We look up the chain_key and do the O(N^2) check and update of - * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires - * graph_lock for us) - */ - if (!hlock->trylock && (hlock->check == 2) && - lookup_chain_cache(curr, hlock, chain_key)) { - /* - * Check whether last held lock: - * - * - is irq-safe, if this lock is irq-unsafe - * - is softirq-safe, if this lock is hardirq-unsafe - * - * And check whether the new lock's dependency graph - * could lead back to the previous lock. - * - * any of these scenarios could lead to a deadlock. 
If - * All validations - */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); - - if (!ret) - return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* - * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: - */ - if (!chain_head && ret != 2) - if (!check_prevs_add(curr, hlock)) - return 0; - graph_unlock(); - } else - /* after lookup_chain_cache(): */ - if (unlikely(!debug_locks)) - return 0; - - return 1; -} -#else -static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) -{ - return 1; -} -#endif - -/* - * We are building curr_chain_key incrementally, so double-check - * it from scratch, to make sure that it's done correctly: - */ -static void check_chain_key(struct task_struct *curr) -{ -#ifdef CONFIG_DEBUG_LOCKDEP - struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; - u64 chain_key = 0; - - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - if (chain_key != hlock->prev_chain_key) { - debug_locks_off(); - WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", - curr->lockdep_depth, i, - (unsigned long long)chain_key, - (unsigned long long)hlock->prev_chain_key); - return; - } - id = hlock->class_idx - 1; - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - return; - - if (prev_hlock && (prev_hlock->irq_context != - hlock->irq_context)) - chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); - prev_hlock = hlock; - } - if (chain_key != curr->curr_chain_key) { - debug_locks_off(); - WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", - curr->lockdep_depth, i, - (unsigned long long)chain_key, - (unsigned long long)curr->curr_chain_key); - } -#endif -} - -static int -print_usage_bug(struct task_struct *curr, struct held_lock *this, - enum 
lock_usage_bit prev_bit, enum lock_usage_bit new_bit) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); - print_kernel_version(); - printk( "---------------------------------\n"); - - printk("inconsistent {%s} -> {%s} usage.\n", - usage_str[prev_bit], usage_str[new_bit]); - - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, task_pid_nr(curr), - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); - print_lock(this); - - printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); - - print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Print out an error if an invalid bit is set: - */ -static inline int -valid_state(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) -{ - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); - return 1; -} - -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - -/* - * print irq inversion bug: - */ -static int -print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, - struct held_lock *this, int forwards, - const char *irqclass) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=========================================================\n"); - printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); - 
print_kernel_version(); - printk( "---------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(this); - if (forwards) - printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); - else - printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); - print_lock_name(other); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(hlock_class(this), 0); - - printk("\nthe second lock's dependencies:\n"); - print_lock_dependencies(other, 0); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Prove that in the forwards-direction subgraph starting at - * there is no lock matching : - */ -static int -check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) -{ - int ret; - - find_usage_bit = bit; - /* fills in */ - ret = find_usage_forwards(hlock_class(this), 0); - if (!ret || ret == 1) - return ret; - - return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); -} - -/* - * Prove that in the backwards-direction subgraph starting at - * there is no lock matching : - */ -static int -check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) -{ - int ret; - - find_usage_bit = bit; - /* fills in */ - ret = find_usage_backwards(hlock_class(this), 0); - if (!ret || ret == 1) - return ret; - - return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); -} - -void print_irqtrace_events(struct task_struct *curr) -{ - printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - 
print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); -} - -static int hardirq_verbose(struct lock_class *class) -{ -#if HARDIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int softirq_verbose(struct lock_class *class) -{ -#if SOFTIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#define STRICT_READ_CHECKS 1 - -static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - int ret = 1; - - switch(new_bit) { - case LOCK_USED_IN_HARDIRQ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_ENABLED_HARDIRQS_READ)) - return 0; - /* - * just marked it hardirq-safe, check that this lock - * took no hardirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it hardirq-safe, check that this lock - * took no hardirq-unsafe-read lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS_READ, "hard-read")) - return 0; -#endif - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_USED_IN_SOFTIRQ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_ENABLED_SOFTIRQS_READ)) - return 0; - /* - * just marked it softirq-safe, check that this lock - * took no softirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it softirq-safe, check that this lock - * took 
no softirq-unsafe-read lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) - return 0; -#endif - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_USED_IN_HARDIRQ_READ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) - return 0; - /* - * just marked it hardirq-read-safe, check that this lock - * took no hardirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_HARDIRQS, "hard")) - return 0; - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_USED_IN_SOFTIRQ_READ: - if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) - return 0; - /* - * just marked it softirq-read-safe, check that this lock - * took no softirq-unsafe lock in the past: - */ - if (!check_usage_forwards(curr, this, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_HARDIRQS: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - /* - * just marked it hardirq-unsafe, check that no hardirq-safe - * lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_HARDIRQ, "hard")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it hardirq-unsafe, check that no - * hardirq-safe-read lock in the system ever took - * it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_HARDIRQ_READ, "hard-read")) - return 0; -#endif - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_SOFTIRQS: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) - return 0; - if (!valid_state(curr, this, new_bit, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - /* - * just marked it softirq-unsafe, check that no softirq-safe - * lock in the system ever took it in the past: - */ - if 
(!check_usage_backwards(curr, this, - LOCK_USED_IN_SOFTIRQ, "soft")) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it softirq-unsafe, check that no - * softirq-safe-read lock in the system ever took - * it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) - return 0; -#endif - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_HARDIRQS_READ: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it hardirq-read-unsafe, check that no - * hardirq-safe lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_HARDIRQ, "hard")) - return 0; -#endif - if (hardirq_verbose(hlock_class(this))) - ret = 2; - break; - case LOCK_ENABLED_SOFTIRQS_READ: - if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) - return 0; -#if STRICT_READ_CHECKS - /* - * just marked it softirq-read-unsafe, check that no - * softirq-safe lock in the system ever took it in the past: - */ - if (!check_usage_backwards(curr, this, - LOCK_USED_IN_SOFTIRQ, "soft")) - return 0; -#endif - if (softirq_verbose(hlock_class(this))) - ret = 2; - break; - default: - WARN_ON(1); - break; - } - - return ret; -} - -/* - * Mark all held locks with a usage bit: - */ -static int -mark_held_locks(struct task_struct *curr, int hardirq) -{ - enum lock_usage_bit usage_bit; - struct held_lock *hlock; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - - if (hardirq) { - if (hlock->read) - usage_bit = LOCK_ENABLED_HARDIRQS_READ; - else - usage_bit = LOCK_ENABLED_HARDIRQS; - } else { - if (hlock->read) - usage_bit = LOCK_ENABLED_SOFTIRQS_READ; - else - usage_bit = LOCK_ENABLED_SOFTIRQS; - } - if (!mark_lock(curr, hlock, usage_bit)) - return 0; - } - - return 1; -} - -/* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any 
invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - -/* - * Hardirqs will be enabled: - */ -void trace_hardirqs_on_caller(unsigned long a0) -{ - struct task_struct *curr = current; - unsigned long ip; - - time_hardirqs_on(CALLER_ADDR0, a0); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) - return; - - if (unlikely(curr->hardirqs_enabled)) { - debug_atomic_inc(&redundant_hardirqs_on); - return; - } - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - ip = (unsigned long) __builtin_return_address(0); - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) - return; - /* - * We are going to turn hardirqs on, so set the - * usage bit for all held locks: - */ - if (!mark_held_locks(curr, 1)) - return; - /* - * If we have softirqs enabled, then set the usage - * bit for all held locks. 
(disabled hardirqs prevented - * this bit from being set before) - */ - if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_on_events); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_on(void) -{ - trace_hardirqs_on_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -/* - * Hardirqs were disabled: - */ -void trace_hardirqs_off_caller(unsigned long a0) -{ - struct task_struct *curr = current; - - time_hardirqs_off(CALLER_ADDR0, a0); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->hardirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = _RET_IP_; - curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_off_events); - } else - debug_atomic_inc(&redundant_hardirqs_off); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -/* - * Softirqs will be enabled: - */ -void trace_softirqs_on(unsigned long ip) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - debug_atomic_inc(&redundant_softirqs_on); - return; - } - - /* - * We'll do an OFF -> ON transition: - */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_on_events); - /* - * We are going to turn softirqs on, so set the - * usage bit for all held locks, if hardirqs are - * enabled too: - */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, 0); -} - -/* - * Softirqs were disabled: - */ -void trace_softirqs_off(unsigned long ip) -{ - struct 
task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_off_events); - DEBUG_LOCKS_WARN_ON(!softirq_count()); - } else - debug_atomic_inc(&redundant_softirqs_off); -} - -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) -{ - /* - * If non-trylock use in a hardirq or softirq context, then - * mark the lock as used in these contexts: - */ - if (!hlock->trylock) { - if (hlock->read) { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - } else { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) - return 0; - } - } - if (!hlock->hardirqs_off) { - if (hlock->read) { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS)) - return 0; - } - } - - return 1; -} - -static int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - unsigned int depth = curr->lockdep_depth; - - /* - * Keep track of points where we cross into an interrupt context: - */ - hlock->irq_context = 2*(curr->hardirq_context ? 
1 : 0) + - curr->softirq_context; - if (depth) { - struct held_lock *prev_hlock; - - prev_hlock = curr->held_locks + depth-1; - /* - * If we cross into another context, reset the - * hash key (this also prevents the checking and the - * adding of the dependency to 'prev'): - */ - if (prev_hlock->irq_context != hlock->irq_context) - return 1; - } - return 0; -} - -#else - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); - return 1; -} - -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) -{ - return 1; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -#endif - -/* - * Mark a lock with a usage bit, and validate the state transition: - */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - unsigned int new_mask = 1 << new_bit, ret = 1; - - /* - * If already set then do not dirty the cacheline, - * nor do any checks: - */ - if (likely(hlock_class(this)->usage_mask & new_mask)) - return 1; - - if (!graph_lock()) - return 0; - /* - * Make sure we didnt race: - */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } - - hlock_class(this)->usage_mask |= new_mask; - - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) - return 0; - - switch (new_bit) { - case LOCK_USED_IN_HARDIRQ: - case LOCK_USED_IN_SOFTIRQ: - case LOCK_USED_IN_HARDIRQ_READ: - case LOCK_USED_IN_SOFTIRQ_READ: - case LOCK_ENABLED_HARDIRQS: - case LOCK_ENABLED_SOFTIRQS: - case LOCK_ENABLED_HARDIRQS_READ: - case LOCK_ENABLED_SOFTIRQS_READ: - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; - case LOCK_USED: - debug_atomic_dec(&nr_unused_locks); - break; - default: - if (!debug_locks_off_graph_unlock()) - return 0; - WARN_ON(1); - return 0; - } - - graph_unlock(); - - /* - * We must printk outside 
of the graph_lock: - */ - if (ret == 2) { - printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); - print_lock(this); - print_irqtrace_events(curr); - dump_stack(); - } - - return ret; -} - -/* - * Initialize a lock instance's lock-class mapping info: - */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - if (unlikely(!debug_locks)) - return; - - if (DEBUG_LOCKS_WARN_ON(!key)) - return; - if (DEBUG_LOCKS_WARN_ON(!name)) - return; - /* - * Sanity check, the lock-class key must be persistent: - */ - if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); - DEBUG_LOCKS_WARN_ON(1); - return; - } - lock->name = name; - lock->key = key; - lock->class_cache = NULL; -#ifdef CONFIG_LOCK_STAT - lock->cpu = raw_smp_processor_id(); -#endif - if (subclass) - register_lock_class(lock, subclass, 1); -} - -EXPORT_SYMBOL_GPL(lockdep_init_map); - -/* - * This gets called for every mutex_lock*()/spin_lock*() operation. - * We maintain the dependency maps and validate the locking attempt: - */ -static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, int hardirqs_off, - struct lockdep_map *nest_lock, unsigned long ip) -{ - struct task_struct *curr = current; - struct lock_class *class = NULL; - struct held_lock *hlock; - unsigned int depth, id; - int chain_head = 0; - u64 chain_key; - - if (!prove_locking) - check = 1; - - if (unlikely(!debug_locks)) - return 0; - - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - return 0; - } - - if (!subclass) - class = lock->class_cache; - /* - * Not cached yet or subclass? 
- */ - if (unlikely(!class)) { - class = register_lock_class(lock, subclass, 0); - if (!class) - return 0; - } - debug_atomic_inc((atomic_t *)&class->ops); - if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - } - - /* - * Add the lock to the list of currently held locks. - * (we dont increase the depth just yet, up until the - * dependency checks are done) - */ - depth = curr->lockdep_depth; - if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) - return 0; - - hlock = curr->held_locks + depth; - if (DEBUG_LOCKS_WARN_ON(!class)) - return 0; - hlock->class_idx = class - lock_classes + 1; - hlock->acquire_ip = ip; - hlock->instance = lock; - hlock->nest_lock = nest_lock; - hlock->trylock = trylock; - hlock->read = read; - hlock->check = check; - hlock->hardirqs_off = !!hardirqs_off; -#ifdef CONFIG_LOCK_STAT - hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); -#endif - - if (check == 2 && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) - return 0; - - /* - * Calculate the chain hash: it's the combined hash of all the - * lock keys along the dependency chain. We save the hash value - * at every step so that we can get the current hash easily - * after unlock. The chain hash is then used to cache dependency - * results. - * - * The 'key ID' is what is the most compact key value to drive - * the hash, not class->key. 
- */ - id = class - lock_classes; - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - return 0; - - chain_key = curr->curr_chain_key; - if (!depth) { - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) - return 0; - chain_head = 1; - } - - hlock->prev_chain_key = chain_key; - if (separate_irq_context(curr, hlock)) { - chain_key = 0; - chain_head = 1; - } - chain_key = iterate_chain_key(chain_key, id); - - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) - return 0; - - curr->curr_chain_key = chain_key; - curr->lockdep_depth++; - check_chain_key(curr); -#ifdef CONFIG_DEBUG_LOCKDEP - if (unlikely(!debug_locks)) - return 0; -#endif - if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { - debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); - printk("turning off the locking correctness validator.\n"); - return 0; - } - - if (unlikely(curr->lockdep_depth > max_lockdep_depth)) - max_lockdep_depth = curr->lockdep_depth; - - return 1; -} - -static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n=====================================\n"); - printk( "[ BUG: bad unlock balance detected! 
]\n"); - printk( "-------------------------------------\n"); - printk("%s/%d is trying to release lock (", - curr->comm, task_pid_nr(curr)); - print_lockdep_cache(lock); - printk(") at:\n"); - print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Common debugging checks for both nested and non-nested unlock: - */ -static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (unlikely(!debug_locks)) - return 0; - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); - - return 1; -} - -static int -__lock_set_subclass(struct lockdep_map *lock, - unsigned int subclass, unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class *class; - unsigned int depth; - int i; - - depth = curr->lockdep_depth; - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (hlock->instance == lock) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_inbalance_bug(curr, lock, ip); - -found_it: - class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; - - curr->lockdep_depth = i; - curr->curr_chain_key = hlock->prev_chain_key; - - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) - return 0; - } - - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) - return 0; - 
return 1; -} - -/* - * Remove the lock to the list of currently held locks in a - * potentially non-nested (out of order) manner. This is a - * relatively rare operation, as all the unlock APIs default - * to nested mode (which uses lock_release()): - */ -static int -lock_release_non_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) -{ - struct held_lock *hlock, *prev_hlock; - unsigned int depth; - int i; - - /* - * Check whether the lock exists in the current stack - * of held locks: - */ - depth = curr->lockdep_depth; - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (hlock->instance == lock) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_inbalance_bug(curr, lock, ip); - -found_it: - lock_release_holdtime(hlock); - - /* - * We have the right lock to unlock, 'hlock' points to it. - * Now we remove it from the stack, and add back the other - * entries (if any), recalculating the hash along the way: - */ - curr->lockdep_depth = i; - curr->curr_chain_key = hlock->prev_chain_key; - - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) - return 0; - } - - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) - return 0; - return 1; -} - -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. 
the current top of the lock-stack is unlocked) - */ -static int lock_release_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) -{ - struct held_lock *hlock; - unsigned int depth; - - /* - * Pop off the top of the lock stack: - */ - depth = curr->lockdep_depth - 1; - hlock = curr->held_locks + depth; - - /* - * Is the unlock non-nested: - */ - if (hlock->instance != lock) - return lock_release_non_nested(curr, lock, ip); - curr->lockdep_depth--; - - if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) - return 0; - - curr->curr_chain_key = hlock->prev_chain_key; - - lock_release_holdtime(hlock); - -#ifdef CONFIG_DEBUG_LOCKDEP - hlock->prev_chain_key = 0; - hlock->class_idx = 0; - hlock->acquire_ip = 0; - hlock->irq_context = 0; -#endif - return 1; -} - -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. 
the current top of the lock-stack is unlocked) - */ -static void -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) -{ - struct task_struct *curr = current; - - if (!check_unlock(curr, lock, ip)) - return; - - if (nested) { - if (!lock_release_nested(curr, lock, ip)) - return; - } else { - if (!lock_release_non_nested(curr, lock, ip)) - return; - } - - check_chain_key(curr); -} - -/* - * Check whether we follow the irq-flags state precisely: - */ -static void check_flags(unsigned long flags) -{ -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) - if (!debug_locks) - return; - - if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { - printk("possible reason: unannotated irqs-off.\n"); - } - } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { - printk("possible reason: unannotated irqs-on.\n"); - } - } - - /* - * We dont accurately track softirq state in e.g. - * hardirq contexts (such as on 4KSTACKS), so only - * check if not in hardirq contexts: - */ - if (!hardirq_count()) { - if (softirq_count()) - DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); - else - DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); - } - - if (!debug_locks) - print_irqtrace_events(current); -#endif -} - -void -lock_set_subclass(struct lockdep_map *lock, - unsigned int subclass, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - current->lockdep_recursion = 1; - check_flags(flags); - if (__lock_set_subclass(lock, subclass, ip)) - check_chain_key(current); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - -EXPORT_SYMBOL_GPL(lock_set_subclass); - -/* - * We are not always called with irqs disabled - do that here, - * and also avoid lockdep recursion: - */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct 
lockdep_map *nest_lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - - current->lockdep_recursion = 1; - __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - -EXPORT_SYMBOL_GPL(lock_acquire); - -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lock_release(lock, nested, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - -EXPORT_SYMBOL_GPL(lock_release); - -#ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n=================================\n"); - printk( "[ BUG: bad contention detected! 
]\n"); - printk( "---------------------------------\n"); - printk("%s/%d is trying to contend lock (", - curr->comm, task_pid_nr(curr)); - print_lockdep_cache(lock); - printk(") at:\n"); - print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static void -__lock_contended(struct lockdep_map *lock, unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class_stats *stats; - unsigned int depth; - int i, point; - - depth = curr->lockdep_depth; - if (DEBUG_LOCKS_WARN_ON(!depth)) - return; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (hlock->instance == lock) - goto found_it; - prev_hlock = hlock; - } - print_lock_contention_bug(curr, lock, ip); - return; - -found_it: - hlock->waittime_stamp = sched_clock(); - - point = lock_contention_point(hlock_class(hlock), ip); - - stats = get_lock_stats(hlock_class(hlock)); - if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[point]++; - if (lock->cpu != smp_processor_id()) - stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); -} - -static void -__lock_acquired(struct lockdep_map *lock) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class_stats *stats; - unsigned int depth; - u64 now; - s64 waittime = 0; - int i, cpu; - - depth = curr->lockdep_depth; - if (DEBUG_LOCKS_WARN_ON(!depth)) - return; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (hlock->instance 
== lock) - goto found_it; - prev_hlock = hlock; - } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; - -found_it: - cpu = smp_processor_id(); - if (hlock->waittime_stamp) { - now = sched_clock(); - waittime = now - hlock->waittime_stamp; - hlock->holdtime_stamp = now; - } - - stats = get_lock_stats(hlock_class(hlock)); - if (waittime) { - if (hlock->read) - lock_time_inc(&stats->read_waittime, waittime); - else - lock_time_inc(&stats->write_waittime, waittime); - } - if (lock->cpu != cpu) - stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); - - lock->cpu = cpu; -} - -void lock_contended(struct lockdep_map *lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(!lock_stat)) - return; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lock_contended(lock, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_contended); - -void lock_acquired(struct lockdep_map *lock) -{ - unsigned long flags; - - if (unlikely(!lock_stat)) - return; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lock_acquired(lock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_acquired); -#endif - -/* - * Used by the testsuite, sanitize the validator state - * after a simulated failure: - */ - -void lockdep_reset(void) -{ - unsigned long flags; - int i; - - raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; - memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); - nr_hardirq_chains = 0; - nr_softirq_chains = 0; - nr_process_chains = 0; - debug_locks = 1; - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - raw_local_irq_restore(flags); -} - -static void 
zap_class(struct lock_class *class) -{ - int i; - - /* - * Remove all dependencies this lock is - * involved in: - */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); - } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - list_del_rcu(&class->hash_entry); - list_del_rcu(&class->lock_entry); - - class->key = NULL; -} - -static inline int within(const void *addr, void *start, unsigned long size) -{ - return addr >= start && addr < start + size; -} - -void lockdep_free_key_range(void *start, unsigned long size) -{ - struct lock_class *class, *next; - struct list_head *head; - unsigned long flags; - int i; - int locked; - - raw_local_irq_save(flags); - locked = graph_lock(); - - /* - * Unhash all classes that were created by this module: - */ - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_safe(class, next, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); - } - } - - if (locked) - graph_unlock(); - raw_local_irq_restore(flags); -} - -void lockdep_reset_lock(struct lockdep_map *lock) -{ - struct lock_class *class, *next; - struct list_head *head; - unsigned long flags; - int i, j; - int locked; - - raw_local_irq_save(flags); - - /* - * Remove all classes this lock might have: - */ - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - /* - * If the class exists we look it up and zap it: - */ - class = look_up_lock_class(lock, j); - if (class) - zap_class(class); - } - /* - * Debug check: in the end all mapped classes should - * be gone. 
- */ - locked = graph_lock(); - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { - if (debug_locks_off_graph_unlock()) - WARN_ON(1); - goto out_restore; - } - } - } - if (locked) - graph_unlock(); - -out_restore: - raw_local_irq_restore(flags); -} - -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - -void __init lockdep_info(void) -{ - printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); - - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error! 
Arch code didn't call lockdep_init() early enough?\n"); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif -} - -static void -print_freed_lock_bug(struct task_struct *curr, const void *mem_from, - const void *mem_to, struct held_lock *hlock) -{ - if (!debug_locks_off()) - return; - if (debug_locks_silent) - return; - - printk("\n=========================\n"); - printk( "[ BUG: held lock freed! ]\n"); - printk( "-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", - curr->comm, task_pid_nr(curr), mem_from, mem_to-1); - print_lock(hlock); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); -} - -static inline int not_in_range(const void* mem_from, unsigned long mem_len, - const void* lock_from, unsigned long lock_len) -{ - return lock_from + lock_len <= mem_from || - mem_from + mem_len <= lock_from; -} - -/* - * Called when kernel memory is freed (or unmapped), or if a lock - * is destroyed or reinitialized - this code checks whether there is - * any held lock in the memory range of to : - */ -void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) -{ - struct task_struct *curr = current; - struct held_lock *hlock; - unsigned long flags; - int i; - - if (unlikely(!debug_locks)) - return; - - local_irq_save(flags); - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - - if (not_in_range(mem_from, mem_len, hlock->instance, - sizeof(*hlock->instance))) - continue; - - print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); - break; - } - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); - -static void print_held_locks_bug(struct task_struct *curr) -{ - if (!debug_locks_off()) - return; - if (debug_locks_silent) - return; - - printk("\n=====================================\n"); - printk( "[ BUG: lock held at task exit time! 
]\n"); - printk( "-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); -} - -void debug_check_no_locks_held(struct task_struct *task) -{ - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); -} - -void debug_show_all_locks(void) -{ - struct task_struct *g, *p; - int count = 10; - int unlock = 1; - - if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); - return; - } - printk("\nShowing all locks held in the system:\n"); - - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - printk(" #%d", 10-count); - mdelay(200); - goto retry; - } - printk(" ignoring it.\n"); - unlock = 0; - } - if (count != 10) - printk(" locked it.\n"); - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) - continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - printk("\n"); - printk("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); -} - -EXPORT_SYMBOL_GPL(debug_show_all_locks); - -/* - * Careful: only use this function if you are sure that - * the task cannot run in parallel! 
- */ -void __debug_show_held_locks(struct task_struct *task) -{ - if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); - return; - } - lockdep_print_held_locks(task); -} -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} - -EXPORT_SYMBOL_GPL(debug_show_held_locks); - -void lockdep_sys_exit(void) -{ - struct task_struct *curr = current; - - if (unlikely(curr->lockdep_depth)) { - if (!debug_locks_off()) - return; - printk("\n================================================\n"); - printk( "[ BUG: lock held when returning to user space! ]\n"); - printk( "------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", - curr->comm, curr->pid); - lockdep_print_held_locks(curr); - } -} -/* - * kernel/lockdep_proc.c - * - * Runtime locking correctness validator - * - * Started by Ingo Molnar: - * - * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Code for /proc/lockdep and /proc/lockdep_stats: - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lockdep_internals.h" - -static void *l_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct lock_class *class; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - class = m->private; - else { - class = v; - - if (class->lock_entry.next != &all_lock_classes) - class = list_entry(class->lock_entry.next, - struct lock_class, lock_entry); - else - class = NULL; - } - - return class; -} - -static void *l_start(struct seq_file *m, loff_t *pos) -{ - struct lock_class *class; - loff_t i = 0; - - if (*pos == 0) - return SEQ_START_TOKEN; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (++i == *pos) - return class; - } - return NULL; -} - -static void l_stop(struct seq_file *m, void *v) -{ -} - -static void print_name(struct seq_file *m, 
struct lock_class *class) -{ - char str[128]; - const char *name = class->name; - - if (!name) { - name = __get_key_name(class->key, str); - seq_printf(m, "%s", name); - } else{ - seq_printf(m, "%s", name); - if (class->name_version > 1) - seq_printf(m, "#%d", class->name_version); - if (class->subclass) - seq_printf(m, "/%d", class->subclass); - } -} - -static int l_show(struct seq_file *m, void *v) -{ - struct lock_class *class = v; - struct lock_list *entry; - char c1, c2, c3, c4; - - if (v == SEQ_START_TOKEN) { - seq_printf(m, "all lock classes:\n"); - return 0; - } - - seq_printf(m, "%p", class->key); -#ifdef CONFIG_DEBUG_LOCKDEP - seq_printf(m, " OPS:%8ld", class->ops); -#endif -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); - seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); -#endif - - get_usage_chars(class, &c1, &c2, &c3, &c4); - seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); - - seq_printf(m, ": "); - print_name(m, class); - seq_puts(m, "\n"); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (entry->distance == 1) { - seq_printf(m, " -> [%p] ", entry->class->key); - print_name(m, entry->class); - seq_puts(m, "\n"); - } - } - seq_puts(m, "\n"); - - return 0; -} - -static const struct seq_operations lockdep_ops = { - .start = l_start, - .next = l_next, - .stop = l_stop, - .show = l_show, -}; - -static int lockdep_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &lockdep_ops); - if (!res) { - struct seq_file *m = file->private_data; - - if (!list_empty(&all_lock_classes)) - m->private = list_entry(all_lock_classes.next, - struct lock_class, lock_entry); - else - m->private = NULL; - } - return res; -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_PROVE_LOCKING -static void *lc_next(struct seq_file *m, void *v, loff_t *pos) -{ - 
struct lock_chain *chain; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - chain = m->private; - else { - chain = v; - - if (*pos < nr_lock_chains) - chain = lock_chains + *pos; - else - chain = NULL; - } - - return chain; -} - -static void *lc_start(struct seq_file *m, loff_t *pos) -{ - if (*pos == 0) - return SEQ_START_TOKEN; - - if (*pos < nr_lock_chains) - return lock_chains + *pos; - - return NULL; -} - -static void lc_stop(struct seq_file *m, void *v) -{ -} - -static int lc_show(struct seq_file *m, void *v) -{ - struct lock_chain *chain = v; - struct lock_class *class; - int i; - - if (v == SEQ_START_TOKEN) { - seq_printf(m, "all lock chains:\n"); - return 0; - } - - seq_printf(m, "irq_context: %d\n", chain->irq_context); - - for (i = 0; i < chain->depth; i++) { - class = lock_chain_get_class(chain, i); - if (!class->key) - continue; - - seq_printf(m, "[%p] ", class->key); - print_name(m, class); - seq_puts(m, "\n"); - } - seq_puts(m, "\n"); - - return 0; -} - -static const struct seq_operations lockdep_chains_ops = { - .start = lc_start, - .next = lc_next, - .stop = lc_stop, - .show = lc_show, -}; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &lockdep_chains_ops); - if (!res) { - struct seq_file *m = file->private_data; - - if (nr_lock_chains) - m->private = lock_chains; - else - m->private = NULL; - } - return res; -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif /* CONFIG_PROVE_LOCKING */ - -static void lockdep_stats_debug_show(struct seq_file *m) -{ -#ifdef CONFIG_DEBUG_LOCKDEP - unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), - hi2 = debug_atomic_read(&hardirqs_off_events), - hr1 = debug_atomic_read(&redundant_hardirqs_on), - hr2 = debug_atomic_read(&redundant_hardirqs_off), - si1 = debug_atomic_read(&softirqs_on_events), - si2 = 
debug_atomic_read(&softirqs_off_events), - sr1 = debug_atomic_read(&redundant_softirqs_on), - sr2 = debug_atomic_read(&redundant_softirqs_off); - - seq_printf(m, " chain lookup misses: %11u\n", - debug_atomic_read(&chain_lookup_misses)); - seq_printf(m, " chain lookup hits: %11u\n", - debug_atomic_read(&chain_lookup_hits)); - seq_printf(m, " cyclic checks: %11u\n", - debug_atomic_read(&nr_cyclic_checks)); - seq_printf(m, " cyclic-check recursions: %11u\n", - debug_atomic_read(&nr_cyclic_check_recursions)); - seq_printf(m, " find-mask forwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask forwards recursions: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_recursions)); - seq_printf(m, " find-mask backwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_backwards_checks)); - seq_printf(m, " find-mask backwards recursions:%11u\n", - debug_atomic_read(&nr_find_usage_backwards_recursions)); - - seq_printf(m, " hardirq on events: %11u\n", hi1); - seq_printf(m, " hardirq off events: %11u\n", hi2); - seq_printf(m, " redundant hardirq ons: %11u\n", hr1); - seq_printf(m, " redundant hardirq offs: %11u\n", hr2); - seq_printf(m, " softirq on events: %11u\n", si1); - seq_printf(m, " softirq off events: %11u\n", si2); - seq_printf(m, " redundant softirq ons: %11u\n", sr1); - seq_printf(m, " redundant softirq offs: %11u\n", sr2); -#endif -} - -static int lockdep_stats_show(struct seq_file *m, void *v) -{ - struct lock_class *class; - unsigned long nr_unused = 0, nr_uncategorized = 0, - nr_irq_safe = 0, nr_irq_unsafe = 0, - nr_softirq_safe = 0, nr_softirq_unsafe = 0, - nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, - nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, - nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, - nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - - if (class->usage_mask == 0) - nr_unused++; - if 
(class->usage_mask == LOCKF_USED) - nr_uncategorized++; - if (class->usage_mask & LOCKF_USED_IN_IRQ) - nr_irq_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQS) - nr_irq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) - nr_softirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) - nr_softirq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) - nr_hardirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) - nr_hardirq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) - nr_irq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) - nr_irq_read_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) - nr_softirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) - nr_softirq_read_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) - nr_hardirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) - nr_hardirq_read_unsafe++; - -#ifdef CONFIG_PROVE_LOCKING - sum_forward_deps += lockdep_count_forward_deps(class); -#endif - } -#ifdef CONFIG_DEBUG_LOCKDEP - DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); -#endif - seq_printf(m, " lock-classes: %11lu [max: %lu]\n", - nr_lock_classes, MAX_LOCKDEP_KEYS); - seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", - nr_list_entries, MAX_LOCKDEP_ENTRIES); - seq_printf(m, " indirect dependencies: %11lu\n", - sum_forward_deps); - - /* - * Total number of dependencies: - * - * All irq-safe locks may nest inside irq-unsafe locks, - * plus all the other known dependencies: - */ - seq_printf(m, " all direct dependencies: %11lu\n", - nr_irq_unsafe * nr_irq_safe + - nr_hardirq_unsafe * nr_hardirq_safe + - nr_list_entries); - - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); - 
seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - seq_printf(m, " in-hardirq chains: %11u\n", - nr_hardirq_chains); - seq_printf(m, " in-softirq chains: %11u\n", - nr_softirq_chains); -#endif - seq_printf(m, " in-process chains: %11u\n", - nr_process_chains); - seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", - nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); - seq_printf(m, " combined max dependencies: %11u\n", - (nr_hardirq_chains + 1) * - (nr_softirq_chains + 1) * - (nr_process_chains + 1) - ); - seq_printf(m, " hardirq-safe locks: %11lu\n", - nr_hardirq_safe); - seq_printf(m, " hardirq-unsafe locks: %11lu\n", - nr_hardirq_unsafe); - seq_printf(m, " softirq-safe locks: %11lu\n", - nr_softirq_safe); - seq_printf(m, " softirq-unsafe locks: %11lu\n", - nr_softirq_unsafe); - seq_printf(m, " irq-safe locks: %11lu\n", - nr_irq_safe); - seq_printf(m, " irq-unsafe locks: %11lu\n", - nr_irq_unsafe); - - seq_printf(m, " hardirq-read-safe locks: %11lu\n", - nr_hardirq_read_safe); - seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", - nr_hardirq_read_unsafe); - seq_printf(m, " softirq-read-safe locks: %11lu\n", - nr_softirq_read_safe); - seq_printf(m, " softirq-read-unsafe locks: %11lu\n", - nr_softirq_read_unsafe); - seq_printf(m, " irq-read-safe locks: %11lu\n", - nr_irq_read_safe); - seq_printf(m, " irq-read-unsafe locks: %11lu\n", - nr_irq_read_unsafe); - - seq_printf(m, " uncategorized locks: %11lu\n", - nr_uncategorized); - seq_printf(m, " unused locks: %11lu\n", - nr_unused); - seq_printf(m, " max locking depth: %11u\n", - max_lockdep_depth); - seq_printf(m, " max recursion depth: %11u\n", - max_recursion_depth); - lockdep_stats_debug_show(m); - seq_printf(m, " debug_locks: %11u\n", - debug_locks); - - return 0; -} - -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static 
const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#ifdef CONFIG_LOCK_STAT - -struct lock_stat_data { - struct lock_class *class; - struct lock_class_stats stats; -}; - -struct lock_stat_seq { - struct lock_stat_data *iter; - struct lock_stat_data *iter_end; - struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; -}; - -/* - * sort on absolute number of contentions - */ -static int lock_stat_cmp(const void *l, const void *r) -{ - const struct lock_stat_data *dl = l, *dr = r; - unsigned long nl, nr; - - nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; - nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; - - return nr - nl; -} - -static void seq_line(struct seq_file *m, char c, int offset, int length) -{ - int i; - - for (i = 0; i < offset; i++) - seq_puts(m, " "); - for (i = 0; i < length; i++) - seq_printf(m, "%c", c); - seq_puts(m, "\n"); -} - -static void snprint_time(char *buf, size_t bufsiz, s64 nr) -{ - unsigned long rem; - - nr += 5; /* for display rounding */ - rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); -} - -static void seq_time(struct seq_file *m, s64 time) -{ - char num[15]; - - snprint_time(num, sizeof(num), time); - seq_printf(m, " %14s", num); -} - -static void seq_lock_time(struct seq_file *m, struct lock_time *lt) -{ - seq_printf(m, "%14lu", lt->nr); - seq_time(m, lt->min); - seq_time(m, lt->max); - seq_time(m, lt->total); -} - -static void seq_stats(struct seq_file *m, struct lock_stat_data *data) -{ - char name[39]; - struct lock_class *class; - struct lock_class_stats *stats; - int i, namelen; - - class = data->class; - stats = &data->stats; - - namelen = 38; - if (class->name_version > 1) - namelen -= 2; /* XXX truncates versions > 9 */ - if (class->subclass) - namelen -= 2; - - if (!class->name) { - char str[KSYM_NAME_LEN]; - const char 
*key_name; - - key_name = __get_key_name(class->key, str); - snprintf(name, namelen, "%s", key_name); - } else { - snprintf(name, namelen, "%s", class->name); - } - namelen = strlen(name); - if (class->name_version > 1) { - snprintf(name+namelen, 3, "#%d", class->name_version); - namelen += 2; - } - if (class->subclass) { - snprintf(name+namelen, 3, "/%d", class->subclass); - namelen += 2; - } - - if (stats->write_holdtime.nr) { - if (stats->read_holdtime.nr) - seq_printf(m, "%38s-W:", name); - else - seq_printf(m, "%40s:", name); - - seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); - seq_lock_time(m, &stats->write_waittime); - seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); - seq_lock_time(m, &stats->write_holdtime); - seq_puts(m, "\n"); - } - - if (stats->read_holdtime.nr) { - seq_printf(m, "%38s-R:", name); - seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); - seq_lock_time(m, &stats->read_waittime); - seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); - seq_lock_time(m, &stats->read_holdtime); - seq_puts(m, "\n"); - } - - if (stats->read_waittime.nr + stats->write_waittime.nr == 0) - return; - - if (stats->read_holdtime.nr) - namelen += 2; - - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { - char sym[KSYM_SYMBOL_LEN]; - char ip[32]; - - if (class->contention_point[i] == 0) - break; - - if (!i) - seq_line(m, '-', 40-namelen, namelen); - - sprint_symbol(sym, class->contention_point[i]); - snprintf(ip, sizeof(ip), "[<%p>]", - (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contention_point[i], - ip, sym); - } - if (i) { - seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); - seq_puts(m, "\n"); - } -} - -static void seq_header(struct seq_file *m) -{ - seq_printf(m, "lock_stat version 0.2\n"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " - "%14s %14s\n", - "class name", - 
"con-bounces", - "contentions", - "waittime-min", - "waittime-max", - "waittime-total", - "acq-bounces", - "acquisitions", - "holdtime-min", - "holdtime-max", - "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); - seq_printf(m, "\n"); -} - -static void *ls_start(struct seq_file *m, loff_t *pos) -{ - struct lock_stat_seq *data = m->private; - - if (*pos == 0) - return SEQ_START_TOKEN; - - data->iter = data->stats + *pos; - if (data->iter >= data->iter_end) - data->iter = NULL; - - return data->iter; -} - -static void *ls_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct lock_stat_seq *data = m->private; - - (*pos)++; - - if (v == SEQ_START_TOKEN) - data->iter = data->stats; - else { - data->iter = v; - data->iter++; - } - - if (data->iter == data->iter_end) - data->iter = NULL; - - return data->iter; -} - -static void ls_stop(struct seq_file *m, void *v) -{ -} - -static int ls_show(struct seq_file *m, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_header(m); - else - seq_stats(m, v); - - return 0; -} - -static struct seq_operations lockstat_ops = { - .start = ls_start, - .next = ls_next, - .stop = ls_stop, - .show = ls_show, -}; - -static int lock_stat_open(struct inode *inode, struct file *file) -{ - int res; - struct lock_class *class; - struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); - - if (!data) - return -ENOMEM; - - res = seq_open(file, &lockstat_ops); - if (!res) { - struct lock_stat_data *iter = data->stats; - struct seq_file *m = file->private_data; - - data->iter = iter; - list_for_each_entry(class, &all_lock_classes, lock_entry) { - iter->class = class; - iter->stats = lock_stats(class); - iter++; - } - data->iter_end = iter; - - sort(data->stats, data->iter_end - data->iter, - sizeof(struct lock_stat_data), - lock_stat_cmp, NULL); - - m->private = data; - } else - vfree(data); - - return res; -} - -static ssize_t lock_stat_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - 
struct lock_class *class; - char c; - - if (count) { - if (get_user(c, buf)) - return -EFAULT; - - if (c != '0') - return count; - - list_for_each_entry(class, &all_lock_classes, lock_entry) - clear_lock_stats(class); - } - return count; -} - -static int lock_stat_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - - vfree(seq->private); - seq->private = NULL; - return seq_release(inode, file); -} - -static const struct file_operations proc_lock_stat_operations = { - .open = lock_stat_open, - .write = lock_stat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = lock_stat_release, -}; -#endif /* CONFIG_LOCK_STAT */ - -static int __init lockdep_proc_init(void) -{ - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); -#ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); -#endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - -#ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); -#endif - - return 0; -} - -__initcall(lockdep_proc_init); - -/* - * Copyright (C) 2007 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern struct marker __start___markers[]; -extern struct marker __stop___markers[]; - -/* Set to 1 to enable marker debug output */ -static const int marker_debug; - -/* - * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers and the hash table. - */ -static DEFINE_MUTEX(markers_mutex); - -/* - * Marker hash table, containing the active markers. - * Protected by module_mutex. - */ -#define MARKER_HASH_BITS 6 -#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) - -/* - * Note about RCU : - * It is used to make sure every handler has finished using its private data - * between two consecutive operation (add or remove) on a given marker. It is - * also used to delay the free of multiple probes array until a quiescent state - * is reached. - * marker entries modifications are protected by the markers_mutex. - */ -struct marker_entry { - struct hlist_node hlist; - char *format; - /* Probe wrapper */ - void (*call)(const struct marker *mdata, void *call_private, ...); - struct marker_probe_closure single; - struct marker_probe_closure *multi; - int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; - unsigned char ptype:1; - char name[0]; /* Contains name'\0'format'\0' */ -}; - -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - -/** - * __mark_empty_function - Empty probe callback - * @probe_private: probe private data - * @call_private: call site private data - * @fmt: format string - * @...: variable argument list - * - * Empty callback provided as a probe to the markers. By providing this to a - * disabled marker, we make sure the execution flow is always valid even - * though the function pointer change and the marker enabling are two distinct - * operations that modifies the execution flow of preemptible code. 
- */ -void __mark_empty_function(void *probe_private, void *call_private, - const char *fmt, va_list *args) -{ -} -EXPORT_SYMBOL_GPL(__mark_empty_function); - -/* - * marker_probe_cb Callback that prepares the variable argument list for probes. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Since we do not use "typical" pointer based RCU in the 1 argument case, we - * need to put a full smp_rmb() in this branch. This is why we do not use - * rcu_dereference() for the pointer read. - */ -void marker_probe_cb(const struct marker *mdata, void *call_private, ...) -{ - va_list args; - char ptype; - - /* - * preempt_disable does two things : disabling preemption to make sure - * the teardown of the callbacks can be done correctly when they are in - * modules and they insure RCU read coherency. - */ - preempt_disable(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - va_start(args, call_private); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - va_end(args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. 
- */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) { - va_start(args, call_private); - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - va_end(args); - } - } - preempt_enable(); -} -EXPORT_SYMBOL_GPL(marker_probe_cb); - -/* - * marker_probe_cb Callback that does not prepare the variable argument list. - * @mdata: pointer of type struct marker - * @call_private: caller site private data - * @...: Variable argument list. - * - * Should be connected to markers "MARK_NOARGS". - */ -void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) -{ - va_list args; /* not initialized */ - char ptype; - - preempt_disable(); - ptype = mdata->ptype; - if (likely(!ptype)) { - marker_probe_func *func; - /* Must read the ptype before ptr. They are not data dependant, - * so we put an explicit smp_rmb() here. */ - smp_rmb(); - func = mdata->single.func; - /* Must read the ptr before private data. They are not data - * dependant, so we put an explicit smp_rmb() here. */ - smp_rmb(); - func(mdata->single.probe_private, call_private, mdata->format, - &args); - } else { - struct marker_probe_closure *multi; - int i; - /* - * Read mdata->ptype before mdata->multi. - */ - smp_rmb(); - multi = mdata->multi; - /* - * multi points to an array, therefore accessing the array - * depends on reading multi. However, even in this case, - * we must insure that the pointer is read _before_ the array - * data. Same as rcu_dereference, but we need a full smp_rmb() - * in the fast path, so put the explicit barrier here. 
- */ - smp_read_barrier_depends(); - for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, - mdata->format, &args); - } - preempt_enable(); -} -EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); - -static void free_old_closure(struct rcu_head *head) -{ - struct marker_entry *entry = container_of(head, - struct marker_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; -} - -static void debug_print_probes(struct marker_entry *entry) -{ - int i; - - if (!marker_debug) - return; - - if (!entry->ptype) { - printk(KERN_DEBUG "Single probe : %p %p\n", - entry->single.func, - entry->single.probe_private); - } else { - for (i = 0; entry->multi[i].func; i++) - printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, - entry->multi[i].func, - entry->multi[i].probe_private); - } -} - -static struct marker_probe_closure * -marker_entry_add_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0; - struct marker_probe_closure *old, *new; - - WARN_ON(!probe); - - debug_print_probes(entry); - old = entry->multi; - if (!entry->ptype) { - if (entry->single.func == probe && - entry->single.probe_private == probe_private) - return ERR_PTR(-EBUSY); - if (entry->single.func == __mark_empty_function) { - /* 0 -> 1 probes */ - entry->single.func = probe; - entry->single.probe_private = probe_private; - entry->refcount = 1; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* 1 -> 2 probes */ - nr_probes = 1; - old = NULL; - } - } else { - /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe - && old[nr_probes].probe_private - == probe_private) - return ERR_PTR(-EBUSY); - } - /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), - GFP_KERNEL); - if (new == NULL) - 
return ERR_PTR(-ENOMEM); - if (!old) - new[0] = entry->single; - else - memcpy(new, old, - nr_probes * sizeof(struct marker_probe_closure)); - new[nr_probes].func = probe; - new[nr_probes].probe_private = probe_private; - entry->refcount = nr_probes + 1; - entry->multi = new; - entry->ptype = 1; - debug_print_probes(entry); - return old; -} - -static struct marker_probe_closure * -marker_entry_remove_probe(struct marker_entry *entry, - marker_probe_func *probe, void *probe_private) -{ - int nr_probes = 0, nr_del = 0, i; - struct marker_probe_closure *old, *new; - - old = entry->multi; - - debug_print_probes(entry); - if (!entry->ptype) { - /* 0 -> N is an error */ - WARN_ON(entry->single.func == __mark_empty_function); - /* 1 -> 0 probes */ - WARN_ON(probe && entry->single.func != probe); - WARN_ON(entry->single.probe_private != probe_private); - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - debug_print_probes(entry); - return NULL; - } else { - /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if ((!probe || old[nr_probes].func == probe) - && old[nr_probes].probe_private - == probe_private) - nr_del++; - } - } - - if (nr_probes - nr_del == 0) { - /* N -> 0, (N > 1) */ - entry->single.func = __mark_empty_function; - entry->refcount = 0; - entry->ptype = 0; - } else if (nr_probes - nr_del == 1) { - /* N -> 1, (N > 1) */ - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - entry->single = old[i]; - entry->refcount = 1; - entry->ptype = 0; - } else { - int j = 0; - /* N -> M, (N > 1, M > 1) */ - /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(struct marker_probe_closure), GFP_KERNEL); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if ((probe && old[i].func != probe) || - old[i].probe_private != probe_private) - new[j++] = old[i]; - entry->refcount = nr_probes - 
nr_del; - entry->ptype = 1; - entry->multi = new; - } - debug_print_probes(entry); - return old; -} - -/* - * Get marker if the marker is present in the marker hash table. - * Must be called with markers_mutex held. - * Returns NULL if not present. - */ -static struct marker_entry *get_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} - -/* - * Add the marker to the marker hash table. Must be called with markers_mutex - * held. - */ -static struct marker_entry *add_marker(const char *name, const char *format) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - size_t format_len = 0; - u32 hash = jhash(name, name_len-1, 0); - - if (format) - format_len = strlen(format) + 1; - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "Marker %s busy\n", name); - return ERR_PTR(-EBUSY); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. 
- */ - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - if (format) { - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); - } else { - e->format = NULL; - e->call = marker_probe_cb; - } - e->single.func = __mark_empty_function; - e->single.probe_private = NULL; - e->multi = NULL; - e->ptype = 0; - e->refcount = 0; - e->rcu_pending = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the marker from the marker hash table. Must be called with mutex_lock - * held. - */ -static int remove_marker(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->single.func != __mark_empty_function) - return -EBUSY; - hlist_del(&e->hlist); - /* Make sure the call_rcu has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); - kfree(e); - return 0; -} - -/* - * Set the mark_entry format to the format found in the element. 
- */ -static int marker_set_format(struct marker_entry **entry, const char *format) -{ - struct marker_entry *e; - size_t name_len = strlen((*entry)->name) + 1; - size_t format_len = strlen(format) + 1; - - - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) - return -ENOMEM; - memcpy(&e->name[0], (*entry)->name, name_len); - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - e->single = (*entry)->single; - e->multi = (*entry)->multi; - e->ptype = (*entry)->ptype; - e->refcount = (*entry)->refcount; - e->rcu_pending = 0; - hlist_add_before(&e->hlist, &(*entry)->hlist); - hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); - kfree(*entry); - *entry = e; - trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); - return 0; -} - -/* - * Sets the probe callback corresponding to one marker. - */ -static int set_marker(struct marker_entry **entry, struct marker *elem, - int active) -{ - int ret; - WARN_ON(strcmp((*entry)->name, elem->name) != 0); - - if ((*entry)->format) { - if (strcmp((*entry)->format, elem->format) != 0) { - printk(KERN_NOTICE - "Format mismatch for probe %s " - "(%s), marker (%s)\n", - (*entry)->name, - (*entry)->format, - elem->format); - return -EPERM; - } - } else { - ret = marker_set_format(entry, elem->format); - if (ret) - return ret; - } - - /* - * probe_cb setup (statically known) is done here. It is - * asynchronous with the rest of execution, therefore we only - * pass from a "safe" callback (with argument) to an "unsafe" - * callback (does not set arguments). - */ - elem->call = (*entry)->call; - /* - * Sanity check : - * We only update the single probe private data when the ptr is - * set to a _non_ single probe! 
(0 -> 1 and N -> 1, N != 1) - */ - WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private - != (*entry)->single.probe_private && - !elem->ptype); - elem->single.probe_private = (*entry)->single.probe_private; - /* - * Make sure the private data is valid when we update the - * single probe ptr. - */ - smp_wmb(); - elem->single.func = (*entry)->single.func; - /* - * We also make sure that the new probe callbacks array is consistent - * before setting a pointer to it. - */ - rcu_assign_pointer(elem->multi, (*entry)->multi); - /* - * Update the function or multi probe array pointer before setting the - * ptype. - */ - smp_wmb(); - elem->ptype = (*entry)->ptype; - elem->state = active; - - return 0; -} - -/* - * Disable a marker and its probe callback. - * Note: only waiting an RCU period after setting elem->call to the empty - * function insures that the original callback is not used anymore. This insured - * by preempt_disable around the call site. - */ -static void disable_marker(struct marker *elem) -{ - /* leave "call" as is. It is known statically. */ - elem->state = 0; - elem->single.func = __mark_empty_function; - /* Update the function before setting the ptype */ - smp_wmb(); - elem->ptype = 0; /* single probe */ - /* - * Leave the private data and id there, because removal is racy and - * should be done only after an RCU period. These are never used until - * the next initialization anyway. - */ -} - -/** - * marker_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of markers. 
- */ -void marker_update_probe_range(struct marker *begin, - struct marker *end) -{ - struct marker *iter; - struct marker_entry *mark_entry; - - mutex_lock(&markers_mutex); - for (iter = begin; iter < end; iter++) { - mark_entry = get_marker(iter->name); - if (mark_entry) { - set_marker(&mark_entry, iter, - !!mark_entry->refcount); - /* - * ignore error, continue - */ - } else { - disable_marker(iter); - } - } - mutex_unlock(&markers_mutex); -} - -/* - * Update probes, removing the faulty probes. - * - * Internal callback only changed before the first probe is connected to it. - * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 - * transitions. All other transitions will leave the old private data valid. - * This makes the non-atomicity of the callback/private data updates valid. - * - * "special case" updates : - * 0 -> 1 callback - * 1 -> 0 callback - * 1 -> 2 callbacks - * 2 -> 1 callbacks - * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. - * Site effect : marker_set_format may delete the marker entry (creating a - * replacement). - */ -static void marker_update_probes(void) -{ - /* Core kernel markers */ - marker_update_probe_range(__start___markers, __stop___markers); - /* Markers in modules. */ - module_update_markers(); -} - -/** - * marker_probe_register - Connect a probe to a marker - * @name: marker name - * @format: format string - * @probe: probe handler - * @probe_private: probe private data - * - * private data must be a valid allocated memory address, or NULL. - * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. 
- */ -int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - entry = add_marker(name, format); - if (IS_ERR(entry)) { - ret = PTR_ERR(entry); - goto end; - } - } - /* - * If we detect that a call_rcu is pending for this marker, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_add_probe(entry, probe, probe_private); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ - mutex_lock(&markers_mutex); - entry = get_marker(name); - WARN_ON(!entry); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_register); - -/** - * marker_probe_unregister - Disconnect a probe from a marker - * @name: marker name - * @probe: probe function pointer - * @probe_private: probe private data - * - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. 
- */ -int marker_probe_unregister(const char *name, - marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - struct marker_probe_closure *old; - int ret = -ENOENT; - - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, probe, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ - mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) - goto end; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(name); /* Ignore busy error message */ - ret = 0; -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister); - -static struct marker_entry * -get_marker_from_private_data(marker_probe_func *probe, void *probe_private) -{ - struct marker_entry *entry; - unsigned int i; - struct hlist_head *head; - struct hlist_node *node; - - for (i = 0; i < MARKER_TABLE_SIZE; i++) { - head = &marker_table[i]; - hlist_for_each_entry(entry, node, head, hlist) { - if (!entry->ptype) { - if (entry->single.func == probe - && entry->single.probe_private - == probe_private) - return entry; - } else { - struct marker_probe_closure *closure; - closure = entry->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func == probe && - closure[i].probe_private - == probe_private) - return entry; - } - } - } - } - return NULL; -} - -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @probe: probe function - * @probe_private: probe private data - * - * Unregister a probe by providing the registered private data. - * Only removes the first marker found in hash table. - * Return 0 on success or error value. 
- * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. - */ -int marker_probe_unregister_private_data(marker_probe_func *probe, - void *probe_private) -{ - struct marker_entry *entry; - int ret = 0; - struct marker_probe_closure *old; - - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - if (!entry) { - ret = -ENOENT; - goto end; - } - if (entry->rcu_pending) - rcu_barrier_sched(); - old = marker_entry_remove_probe(entry, NULL, probe_private); - mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ - mutex_lock(&markers_mutex); - entry = get_marker_from_private_data(probe, probe_private); - WARN_ON(!entry); - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); - remove_marker(entry->name); /* Ignore busy error message */ -end: - mutex_unlock(&markers_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); - -/** - * marker_get_private_data - Get a marker's probe private data - * @name: marker name - * @probe: probe to match - * @num: get the nth matching probe's private data - * - * Returns the nth private data pointer (starting from 0) matching, or an - * ERR_PTR. - * Returns the private data pointer, or an ERR_PTR. - * The private data pointer should _only_ be dereferenced if the caller is the - * owner of the data, or its content could vanish. This is mostly used to - * confirm that a caller is the owner of a registered probe. 
- */ -void *marker_get_private_data(const char *name, marker_probe_func *probe, - int num) -{ - struct hlist_head *head; - struct hlist_node *node; - struct marker_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - int i; - - head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - if (!e->ptype) { - if (num == 0 && e->single.func == probe) - return e->single.probe_private; - else - break; - } else { - struct marker_probe_closure *closure; - int match = 0; - closure = e->multi; - for (i = 0; closure[i].func; i++) { - if (closure[i].func != probe) - continue; - if (match++ == num) - return closure[i].probe_private; - } - } - } - } - return ERR_PTR(-ENOENT); -} -EXPORT_SYMBOL_GPL(marker_get_private_data); -/* - Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt , a...) -#endif - -#ifndef ARCH_SHF_SMALL -#define ARCH_SHF_SMALL 0 -#endif - -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) - -/* List of modules, protected by module_mutex or preempt_disable - * (add/delete uses stop_machine). */ -static DEFINE_MUTEX(module_mutex); -static LIST_HEAD(modules); - -/* Waiting for a module to finish initializing? */ -static DECLARE_WAIT_QUEUE_HEAD(module_wq); - -static BLOCKING_NOTIFIER_HEAD(module_notify_list); - -/* Bounds of module allocation, for speeding __module_text_address */ -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - -int register_module_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_register(&module_notify_list, nb); -} -EXPORT_SYMBOL(register_module_notifier); - -int unregister_module_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_unregister(&module_notify_list, nb); -} -EXPORT_SYMBOL(unregister_module_notifier); - -/* We require a truly strong try_module_get(): 0 means failure due to - ongoing or failed initialization etc. 
*/ -static inline int strong_try_module_get(struct module *mod) -{ - if (mod && mod->state == MODULE_STATE_COMING) - return -EBUSY; - if (try_module_get(mod)) - return 0; - else - return -ENOENT; -} - -static inline void add_taint_module(struct module *mod, unsigned flag) -{ - add_taint(flag); - mod->taints |= flag; -} - -/* - * A thread that wants to hold a reference to a module only while it - * is running can call this to safely exit. nfsd and lockd use this. - */ -void __module_put_and_exit(struct module *mod, long code) -{ - module_put(mod); - do_exit(code); -} -EXPORT_SYMBOL(__module_put_and_exit); - -/* Find a module section: 0 means not found. */ -static unsigned int find_sec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, - const char *name) -{ - unsigned int i; - - for (i = 1; i < hdr->e_shnum; i++) - /* Alloc bit cleared means "ignore it." */ - if ((sechdrs[i].sh_flags & SHF_ALLOC) - && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) - return i; - return 0; -} - -/* Provided by the linker */ -extern const struct kernel_symbol __start___ksymtab[]; -extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const unsigned long __start___kcrctab[]; -extern const unsigned long __start___kcrctab_gpl[]; -extern const unsigned long __start___kcrctab_gpl_future[]; -#ifdef CONFIG_UNUSED_SYMBOLS -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; -extern const 
unsigned long __start___kcrctab_unused[]; -extern const unsigned long __start___kcrctab_unused_gpl[]; -#endif - -#ifndef CONFIG_MODVERSIONS -#define symversion(base, idx) NULL -#else -#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) -#endif - -struct symsearch { - const struct kernel_symbol *start, *stop; - const unsigned long *crcs; - enum { - NOT_GPL_ONLY, - GPL_ONLY, - WILL_BE_GPL_ONLY, - } licence; - bool unused; -}; - -static bool each_symbol_in_section(const struct symsearch *arr, - unsigned int arrsize, - struct module *owner, - bool (*fn)(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data), - void *data) -{ - unsigned int i, j; - - for (j = 0; j < arrsize; j++) { - for (i = 0; i < arr[j].stop - arr[j].start; i++) - if (fn(&arr[j], owner, i, data)) - return true; - } - - return false; -} - -/* Returns true as soon as fn returns true, otherwise false. */ -static bool each_symbol(bool (*fn)(const struct symsearch *arr, - struct module *owner, - unsigned int symnum, void *data), - void *data) -{ - struct module *mod; - const struct symsearch arr[] = { - { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - NOT_GPL_ONLY, false }, - { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, - GPL_ONLY, false }, - { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, - WILL_BE_GPL_ONLY, false }, -#ifdef CONFIG_UNUSED_SYMBOLS - { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, - NOT_GPL_ONLY, true }, - { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, - GPL_ONLY, true }, -#endif - }; - - if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) - return true; - - list_for_each_entry(mod, &modules, list) { - struct symsearch arr[] = { - { mod->syms, mod->syms + mod->num_syms, mod->crcs, - NOT_GPL_ONLY, false }, - { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - 
mod->gpl_crcs, - GPL_ONLY, false }, - { mod->gpl_future_syms, - mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, - WILL_BE_GPL_ONLY, false }, -#ifdef CONFIG_UNUSED_SYMBOLS - { mod->unused_syms, - mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, - NOT_GPL_ONLY, true }, - { mod->unused_gpl_syms, - mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, - GPL_ONLY, true }, -#endif - }; - - if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) - return true; - } - return false; -} - -struct find_symbol_arg { - /* Input */ - const char *name; - bool gplok; - bool warn; - - /* Output */ - struct module *owner; - const unsigned long *crc; - unsigned long value; -}; - -static bool find_symbol_in_section(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) -{ - struct find_symbol_arg *fsa = data; - - if (strcmp(syms->start[symnum].name, fsa->name) != 0) - return false; - - if (!fsa->gplok) { - if (syms->licence == GPL_ONLY) - return false; - if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); - } - } - -#ifdef CONFIG_UNUSED_SYMBOLS - if (syms->unused && fsa->warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", fsa->name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); - } -#endif - - fsa->owner = owner; - fsa->crc = symversion(syms->crcs, symnum); - fsa->value = syms->start[symnum].value; - return true; -} - -/* Find a 
symbol, return value, (optional) crc and (optional) module - * which owns it */ -static unsigned long find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) -{ - struct find_symbol_arg fsa; - - fsa.name = name; - fsa.gplok = gplok; - fsa.warn = warn; - - if (each_symbol(find_symbol_in_section, &fsa)) { - if (owner) - *owner = fsa.owner; - if (crc) - *crc = fsa.crc; - return fsa.value; - } - - DEBUGP("Failed to find symbol %s\n", name); - return -ENOENT; -} - -/* Search for module by name: must hold module_mutex. */ -static struct module *find_module(const char *name) -{ - struct module *mod; - - list_for_each_entry(mod, &modules, list) { - if (strcmp(mod->name, name) == 0) - return mod; - } - return NULL; -} - -#ifdef CONFIG_SMP -/* Number of blocks used and allocated. */ -static unsigned int pcpu_num_used, pcpu_num_allocated; -/* Size of each block. -ve means used. */ -static int *pcpu_size; - -static int split_block(unsigned int i, unsigned short size) -{ - /* Reallocation required? 
*/ - if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new; - - new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, - GFP_KERNEL); - if (!new) - return 0; - - pcpu_num_allocated *= 2; - pcpu_size = new; - } - - /* Insert a new subblock */ - memmove(&pcpu_size[i+1], &pcpu_size[i], - sizeof(pcpu_size[0]) * (pcpu_num_used - i)); - pcpu_num_used++; - - pcpu_size[i+1] -= size; - pcpu_size[i] = size; - return 1; -} - -static inline unsigned int block_size(int val) -{ - if (val < 0) - return -val; - return val; -} - -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - unsigned long extra; - unsigned int i; - void *ptr; - - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - ptr = __per_cpu_start; - for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - /* Extra for alignment requirement. */ - extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; - BUG_ON(i == 0 && extra != 0); - - if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) - continue; - - /* Transfer extra to previous block. */ - if (pcpu_size[i-1] < 0) - pcpu_size[i-1] -= extra; - else - pcpu_size[i-1] += extra; - pcpu_size[i] -= extra; - ptr += extra; - - /* Split block if warranted */ - if (pcpu_size[i] - size > sizeof(unsigned long)) - if (!split_block(i, size)) - return NULL; - - /* Mark allocated */ - pcpu_size[i] = -pcpu_size[i]; - return ptr; - } - - printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", - size); - return NULL; -} - -static void percpu_modfree(void *freeme) -{ - unsigned int i; - void *ptr = __per_cpu_start + block_size(pcpu_size[0]); - - /* First entry is core kernel percpu data. */ - for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - if (ptr == freeme) { - pcpu_size[i] = -pcpu_size[i]; - goto free; - } - } - BUG(); - - free: - /* Merge with previous? 
*/ - if (pcpu_size[i-1] >= 0) { - pcpu_size[i-1] += pcpu_size[i]; - pcpu_num_used--; - memmove(&pcpu_size[i], &pcpu_size[i+1], - (pcpu_num_used - i) * sizeof(pcpu_size[0])); - i--; - } - /* Merge with next? */ - if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { - pcpu_size[i] += pcpu_size[i+1]; - pcpu_num_used--; - memmove(&pcpu_size[i+1], &pcpu_size[i+2], - (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); - } -} - -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - -static int percpu_modinit(void) -{ - pcpu_num_used = 2; - pcpu_num_allocated = 2; - pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, - GFP_KERNEL); - /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); - /* Free room. */ - pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; - if (pcpu_size[1] < 0) { - printk(KERN_ERR "No per-cpu room for modules.\n"); - pcpu_num_used = 1; - } - - return 0; -} -__initcall(percpu_modinit); -#else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - return NULL; -} -static inline void percpu_modfree(void *pcpuptr) -{ - BUG(); -} -static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return 0; -} -static inline void percpu_modcopy(void *pcpudst, const void *src, - unsigned long size) -{ - /* pcpusec should be 0, and size of that section should be 0. 
*/ - BUG_ON(size != 0); -} -#endif /* CONFIG_SMP */ - -#define MODINFO_ATTR(field) \ -static void setup_modinfo_##field(struct module *mod, const char *s) \ -{ \ - mod->field = kstrdup(s, GFP_KERNEL); \ -} \ -static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ - struct module *mod, char *buffer) \ -{ \ - return sprintf(buffer, "%s\n", mod->field); \ -} \ -static int modinfo_##field##_exists(struct module *mod) \ -{ \ - return mod->field != NULL; \ -} \ -static void free_modinfo_##field(struct module *mod) \ -{ \ - kfree(mod->field); \ - mod->field = NULL; \ -} \ -static struct module_attribute modinfo_##field = { \ - .attr = { .name = __stringify(field), .mode = 0444 }, \ - .show = show_modinfo_##field, \ - .setup = setup_modinfo_##field, \ - .test = modinfo_##field##_exists, \ - .free = free_modinfo_##field, \ -}; - -MODINFO_ATTR(version); -MODINFO_ATTR(srcversion); - -static char last_unloaded_module[MODULE_NAME_LEN+1]; - -#ifdef CONFIG_MODULE_UNLOAD -/* Init the unload section of the module. */ -static void module_unload_init(struct module *mod) -{ - unsigned int i; - - INIT_LIST_HEAD(&mod->modules_which_use_me); - for (i = 0; i < NR_CPUS; i++) - local_set(&mod->ref[i].count, 0); - /* Hold reference count during initialization. */ - local_set(&mod->ref[raw_smp_processor_id()].count, 1); - /* Backwards compatibility macros put refcount during init. */ - mod->waiter = current; -} - -/* modules using other modules */ -struct module_use -{ - struct list_head list; - struct module *module_which_uses; -}; - -/* Does a already use b? 
*/ -static int already_uses(struct module *a, struct module *b) -{ - struct module_use *use; - - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { - DEBUGP("%s uses %s!\n", a->name, b->name); - return 1; - } - } - DEBUGP("%s does not use %s!\n", a->name, b->name); - return 0; -} - -/* Module a uses b */ -static int use_module(struct module *a, struct module *b) -{ - struct module_use *use; - int no_warn, err; - - if (b == NULL || already_uses(a, b)) return 1; - - /* If we're interrupted or time out, we fail. */ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); - return 0; - } - - /* If strong_try_module_get() returned a different error, we fail. */ - if (err) - return 0; - - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); - module_put(b); - return 0; - } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); - return 1; -} - -/* Clear the unload stuff of the module. */ -static void module_unload_free(struct module *mod) -{ - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; - - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. 
*/ - break; - } - } - } -} - -#ifdef CONFIG_MODULE_FORCE_UNLOAD -static inline int try_force_unload(unsigned int flags) -{ - int ret = (flags & O_TRUNC); - if (ret) - add_taint(TAINT_FORCED_RMMOD); - return ret; -} -#else -static inline int try_force_unload(unsigned int flags) -{ - return 0; -} -#endif /* CONFIG_MODULE_FORCE_UNLOAD */ - -struct stopref -{ - struct module *mod; - int flags; - int *forced; -}; - -/* Whole machine is stopped with interrupts off when this runs. */ -static int __try_stop_module(void *_sref) -{ - struct stopref *sref = _sref; - - /* If it's not unused, quit unless we're forcing. */ - if (module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force_unload(sref->flags))) - return -EWOULDBLOCK; - } - - /* Mark it as dying. */ - sref->mod->state = MODULE_STATE_GOING; - return 0; -} - -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - if (flags & O_NONBLOCK) { - struct stopref sref = { mod, flags, forced }; - - return stop_machine(__try_stop_module, &sref, NULL); - } else { - /* We don't need to stop the machine for this. 
*/ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - return 0; - } -} - -unsigned int module_refcount(struct module *mod) -{ - unsigned int i, total = 0; - - for (i = 0; i < NR_CPUS; i++) - total += local_read(&mod->ref[i].count); - return total; -} -EXPORT_SYMBOL(module_refcount); - -/* This exists whether we can unload or not */ -static void free_module(struct module *mod); - -static void wait_for_zero_refcount(struct module *mod) -{ - /* Since we might sleep for some time, release the mutex first */ - mutex_unlock(&module_mutex); - for (;;) { - DEBUGP("Looking at refcount...\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - if (module_refcount(mod) == 0) - break; - schedule(); - } - current->state = TASK_RUNNING; - mutex_lock(&module_mutex); -} - -SYSCALL_DEFINE2(delete_module, const char __user *, name_user, - unsigned int, flags) -{ - struct module *mod; - char name[MODULE_NAME_LEN]; - int ret, forced = 0; - - if (!capable(CAP_SYS_MODULE)) - return -EPERM; - - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; - - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - - mod = find_module(name); - if (!mod) { - ret = -ENOENT; - goto out; - } - - if (!list_empty(&mod->modules_which_use_me)) { - /* Other modules depend on us: get rid of them first. */ - ret = -EWOULDBLOCK; - goto out; - } - - /* Doing init or already dying? */ - if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count and wake up - waiter --RR */ - DEBUGP("%s already dying\n", mod->name); - ret = -EBUSY; - goto out; - } - - /* If it has an init func, it must have an exit func to unload */ - if (mod->init && !mod->exit) { - forced = try_force_unload(flags); - if (!forced) { - /* This module can't be removed */ - ret = -EBUSY; - goto out; - } - } - - /* Set this up before setting mod->state */ - mod->waiter = current; - - /* Stop the machine so refcounts can't move and disable module. 
*/ - ret = try_stop_module(mod, flags, &forced); - if (ret != 0) - goto out; - - /* Never wait if forced. */ - if (!forced && module_refcount(mod) != 0) - wait_for_zero_refcount(mod); - - mutex_unlock(&module_mutex); - /* Final destruction now noone is using it. */ - if (mod->exit != NULL) - mod->exit(); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); - /* Store the name of the last unloaded module for diagnostic purposes */ - strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - free_module(mod); - - out: - mutex_unlock(&module_mutex); - return ret; -} - -static void print_unload_info(struct seq_file *m, struct module *mod) -{ - struct module_use *use; - int printed_something = 0; - - seq_printf(m, " %u ", module_refcount(mod)); - - /* Always include a trailing , so userspace can differentiate - between this and the old multi-field proc format. */ - list_for_each_entry(use, &mod->modules_which_use_me, list) { - printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); - } - - if (mod->init != NULL && mod->exit == NULL) { - printed_something = 1; - seq_printf(m, "[permanent],"); - } - - if (!printed_something) - seq_printf(m, "-"); -} - -void __symbol_put(const char *symbol) -{ - struct module *owner; - - preempt_disable(); - if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) - BUG(); - module_put(owner); - preempt_enable(); -} -EXPORT_SYMBOL(__symbol_put); - -void symbol_put_addr(void *addr) -{ - struct module *modaddr; - - if (core_kernel_text((unsigned long)addr)) - return; - - if (!(modaddr = module_text_address((unsigned long)addr))) - BUG(); - module_put(modaddr); -} -EXPORT_SYMBOL_GPL(symbol_put_addr); - -static ssize_t show_refcnt(struct module_attribute *mattr, - struct module *mod, char *buffer) -{ - return sprintf(buffer, "%u\n", module_refcount(mod)); -} - -static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444 
}, - .show = show_refcnt, -}; - -void module_put(struct module *module) -{ - if (module) { - unsigned int cpu = get_cpu(); - local_dec(&module->ref[cpu].count); - /* Maybe they're waiting for us to drop reference? */ - if (unlikely(!module_is_live(module))) - wake_up_process(module->waiter); - put_cpu(); - } -} -EXPORT_SYMBOL(module_put); - -#else /* !CONFIG_MODULE_UNLOAD */ -static void print_unload_info(struct seq_file *m, struct module *mod) -{ - /* We don't know the usage count, or what modules are using. */ - seq_printf(m, " - -"); -} - -static inline void module_unload_free(struct module *mod) -{ -} - -static inline int use_module(struct module *a, struct module *b) -{ - return strong_try_module_get(b) == 0; -} - -static inline void module_unload_init(struct module *mod) -{ -} -#endif /* CONFIG_MODULE_UNLOAD */ - -static ssize_t show_initstate(struct module_attribute *mattr, - struct module *mod, char *buffer) -{ - const char *state = "unknown"; - - switch (mod->state) { - case MODULE_STATE_LIVE: - state = "live"; - break; - case MODULE_STATE_COMING: - state = "coming"; - break; - case MODULE_STATE_GOING: - state = "going"; - break; - } - return sprintf(buffer, "%s\n", state); -} - -static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444 }, - .show = show_initstate, -}; - -static struct module_attribute *modinfo_attrs[] = { - &modinfo_version, - &modinfo_srcversion, - &initstate, -#ifdef CONFIG_MODULE_UNLOAD - &refcnt, -#endif - NULL, -}; - -static const char vermagic[] = VERMAGIC_STRING; - -static int try_to_force_load(struct module *mod, const char *symname) -{ -#ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) - printk("%s: no version for \"%s\" found: kernel tainted.\n", - mod->name, symname); - add_taint_module(mod, TAINT_FORCED_MODULE); - return 0; -#else - return -ENOEXEC; -#endif -} - -#ifdef CONFIG_MODVERSIONS -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, - const char 
*symname, - struct module *mod, - const unsigned long *crc) -{ - unsigned int i, num_versions; - struct modversion_info *versions; - - /* Exporting module didn't supply crcs? OK, we're already tainted. */ - if (!crc) - return 1; - - /* No versions at all? modprobe --force does this. */ - if (versindex == 0) - return try_to_force_load(mod, symname) == 0; - - versions = (void *) sechdrs[versindex].sh_addr; - num_versions = sechdrs[versindex].sh_size - / sizeof(struct modversion_info); - - for (i = 0; i < num_versions; i++) { - if (strcmp(versions[i].name, symname) != 0) - continue; - - if (versions[i].crc == *crc) - return 1; - DEBUGP("Found checksum %lX vs module %lX\n", - *crc, versions[i].crc); - goto bad_version; - } - - printk(KERN_WARNING "%s: no symbol version for %s\n", - mod->name, symname); - return 0; - -bad_version: - printk("%s: disagrees about version of symbol %s\n", - mod->name, symname); - return 0; -} - -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, - struct module *mod) -{ - const unsigned long *crc; - - if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) - BUG(); - return check_version(sechdrs, versindex, "struct_module", mod, crc); -} - -/* First part is kernel version, which we ignore if module has crcs. 
*/ -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - if (has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); - } - return strcmp(amagic, bmagic) == 0; -} -#else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *symname, - struct module *mod, - const unsigned long *crc) -{ - return 1; -} - -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, - struct module *mod) -{ - return 1; -} - -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - return strcmp(amagic, bmagic) == 0; -} -#endif /* CONFIG_MODVERSIONS */ - -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ -static unsigned long resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *name, - struct module *mod) -{ - struct module *owner; - unsigned long ret; - const unsigned long *crc; - - ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); - if (!IS_ERR_VALUE(ret)) { - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (!check_version(sechdrs, versindex, name, mod, crc) || - !use_module(mod, owner)) - ret = -EINVAL; - } - return ret; -} - -/* - * /sys/module/foo/sections stuff - * J. 
Corbet - */ -#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) -struct module_sect_attr -{ - struct module_attribute mattr; - char *name; - unsigned long address; -}; - -struct module_sect_attrs -{ - struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[0]; -}; - -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module *mod, char *buf) -{ - struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); -} - -static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -{ - unsigned int section; - - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); - kfree(sect_attrs); -} - -static void add_sect_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) -{ - unsigned int nloaded = 0, i, size[2]; - struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct attribute **gattr; - - /* Count loaded sections and allocate structures */ - for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC) - nloaded++; - size[0] = ALIGN(sizeof(*sect_attrs) - + nloaded * sizeof(sect_attrs->attrs[0]), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); - if (sect_attrs == NULL) - return; - - /* Setup section attributes. */ - sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; - - sect_attrs->nsections = 0; - sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < nsect; i++) { - if (! 
(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - sattr->address = sechdrs[i].sh_addr; - sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, - GFP_KERNEL); - if (sattr->name == NULL) - goto out; - sect_attrs->nsections++; - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUGO; - *(gattr++) = &(sattr++)->mattr.attr; - } - *gattr = NULL; - - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) - goto out; - - mod->sect_attrs = sect_attrs; - return; - out: - free_sect_attrs(sect_attrs); -} - -static void remove_sect_attrs(struct module *mod) -{ - if (mod->sect_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->sect_attrs->grp); - /* We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. */ - free_sect_attrs(mod->sect_attrs); - mod->sect_attrs = NULL; - } -} - -/* - * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. - */ - -struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[0]; -}; - -static ssize_t module_notes_read(struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. - */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) -{ - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } - kfree(notes_attrs); -} - -static void add_notes_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) -{ - unsigned int notes, loaded, i; - struct module_notes_attrs *notes_attrs; - struct bin_attribute *nattr; - - /* Count notes sections and allocate structures. 
*/ - notes = 0; - for (i = 0; i < nsect; i++) - if ((sechdrs[i].sh_flags & SHF_ALLOC) && - (sechdrs[i].sh_type == SHT_NOTE)) - ++notes; - - if (notes == 0) - return; - - notes_attrs = kzalloc(sizeof(*notes_attrs) - + notes * sizeof(notes_attrs->attrs[0]), - GFP_KERNEL); - if (notes_attrs == NULL) - return; - - notes_attrs->notes = notes; - nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < nsect; ++i) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (sechdrs[i].sh_type == SHT_NOTE) { - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; - nattr->attr.mode = S_IRUGO; - nattr->size = sechdrs[i].sh_size; - nattr->private = (void *) sechdrs[i].sh_addr; - nattr->read = module_notes_read; - ++nattr; - } - ++loaded; - } - - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) - goto out; - - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) - goto out; - - mod->notes_attrs = notes_attrs; - return; - - out: - free_notes_attrs(notes_attrs, i); -} - -static void remove_notes_attrs(struct module *mod) -{ - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); -} - -#else - -static inline void add_sect_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) -{ -} - -static inline void remove_sect_attrs(struct module *mod) -{ -} - -static inline void add_notes_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) -{ -} - -static inline void remove_notes_attrs(struct module *mod) -{ -} -#endif - -#ifdef CONFIG_SYSFS -int module_add_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - struct module_attribute *temp_attr; - int error = 0; - int i; - - mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * - (ARRAY_SIZE(modinfo_attrs) + 1)), - GFP_KERNEL); - if (!mod->modinfo_attrs) - return -ENOMEM; - - temp_attr = mod->modinfo_attrs; - for (i = 0; (attr 
= modinfo_attrs[i]) && !error; i++) { - if (!attr->test || - (attr->test && attr->test(mod))) { - memcpy(temp_attr, attr, sizeof(*temp_attr)); - error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); - ++temp_attr; - } - } - return error; -} - -void module_remove_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { - /* pick a field to test for end of list */ - if (!attr->attr.name) - break; - sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); - if (attr->free) - attr->free(mod); - } - kfree(mod->modinfo_attrs); -} - -int mod_sysfs_init(struct module *mod) -{ - int err; - struct kobject *kobj; - - if (!module_sysfs_initialized) { - printk(KERN_ERR "%s: module sysfs not initialized\n", - mod->name); - err = -EINVAL; - goto out; - } - - kobj = kset_find_obj(module_kset, mod->name); - if (kobj) { - printk(KERN_ERR "%s: module is already loaded\n", mod->name); - kobject_put(kobj); - err = -EINVAL; - goto out; - } - - mod->mkobj.mod = mod; - - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - mod->mkobj.kobj.kset = module_kset; - err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, - "%s", mod->name); - if (err) - kobject_put(&mod->mkobj.kobj); - - /* delay uevent until full sysfs population */ -out: - return err; -} - -int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) -{ - int err; - - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); - if (!mod->holders_dir) { - err = -ENOMEM; - goto out_unreg; - } - - err = module_param_sysfs_setup(mod, kparam, num_params); - if (err) - goto out_unreg_holders; - - err = module_add_modinfo_attrs(mod); - if (err) - goto out_unreg_param; - - kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); - return 0; - -out_unreg_param: - module_param_sysfs_remove(mod); -out_unreg_holders: - kobject_put(mod->holders_dir); -out_unreg: - kobject_put(&mod->mkobj.kobj); - return err; -} - 
-static void mod_sysfs_fini(struct module *mod) -{ - kobject_put(&mod->mkobj.kobj); -} - -#else /* CONFIG_SYSFS */ - -static void mod_sysfs_fini(struct module *mod) -{ -} - -#endif /* CONFIG_SYSFS */ - -static void mod_kobject_remove(struct module *mod) -{ - module_remove_modinfo_attrs(mod); - module_param_sysfs_remove(mod); - kobject_put(mod->mkobj.drivers_dir); - kobject_put(mod->holders_dir); - mod_sysfs_fini(mod); -} - -/* - * link the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __link_module(void *_mod) -{ - struct module *mod = _mod; - list_add(&mod->list, &modules); - return 0; -} - -/* - * unlink the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __unlink_module(void *_mod) -{ - struct module *mod = _mod; - list_del(&mod->list); - return 0; -} - -/* Free a module, remove from lists, etc (must hold module_mutex). */ -static void free_module(struct module *mod) -{ - /* Delete from various lists */ - stop_machine(__unlink_module, mod, NULL); - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_remove(mod); - - unwind_remove_table(mod->unwind_info, 0); - - /* Arch-specific cleanup. 
*/ - module_arch_cleanup(mod); - - /* Module unload stuff */ - module_unload_free(mod); - - /* This may be NULL, but that's OK */ - module_free(mod, mod->module_init); - kfree(mod->args); - if (mod->percpu) - percpu_modfree(mod->percpu); - - /* Free lock-classes: */ - lockdep_free_key_range(mod->module_core, mod->core_size); - - /* Finally, free the core (containing the module structure) */ - module_free(mod, mod->module_core); -} - -void *__symbol_get(const char *symbol) -{ - struct module *owner; - unsigned long value; - - preempt_disable(); - value = find_symbol(symbol, &owner, NULL, true, true); - if (IS_ERR_VALUE(value)) - value = 0; - else if (strong_try_module_get(owner)) - value = 0; - preempt_enable(); - - return (void *)value; -} -EXPORT_SYMBOL_GPL(__symbol_get); - -/* - * Ensure that an exported symbol [global namespace] does not already exist - * in the kernel or in some other module's exported symbol table. - */ -static int verify_export_symbols(struct module *mod) -{ - unsigned int i; - struct module *owner; - const struct kernel_symbol *s; - struct { - const struct kernel_symbol *sym; - unsigned int num; - } arr[] = { - { mod->syms, mod->num_syms }, - { mod->gpl_syms, mod->num_gpl_syms }, - { mod->gpl_future_syms, mod->num_gpl_future_syms }, -#ifdef CONFIG_UNUSED_SYMBOLS - { mod->unused_syms, mod->num_unused_syms }, - { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, -#endif - }; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (!IS_ERR_VALUE(find_symbol(s->name, &owner, - NULL, true, false))) { - printk(KERN_ERR - "%s: exports duplicate symbol %s" - " (owned by %s)\n", - mod->name, s->name, module_name(owner)); - return -ENOEXEC; - } - } - } - return 0; -} - -/* Change all symbols so that st_value encodes the pointer directly. 
*/ -static int simplify_symbols(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - unsigned int versindex, - unsigned int pcpuindex, - struct module *mod) -{ - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; - unsigned long secbase; - unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - int ret = 0; - - for (i = 1; i < n; i++) { - switch (sym[i].st_shndx) { - case SHN_COMMON: - /* We compiled with -fno-common. These are not - supposed to happen. */ - DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); - printk("%s: please compile with -fno-common\n", - mod->name); - ret = -ENOEXEC; - break; - - case SHN_ABS: - /* Don't need to do anything */ - DEBUGP("Absolute symbol: 0x%08lx\n", - (long)sym[i].st_value); - break; - - case SHN_UNDEF: - sym[i].st_value - = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); - - /* Ok if resolved. */ - if (!IS_ERR_VALUE(sym[i].st_value)) - break; - /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) - break; - - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; - break; - - default: - /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == pcpuindex) - secbase = (unsigned long)mod->percpu; - else - secbase = sechdrs[sym[i].st_shndx].sh_addr; - sym[i].st_value += secbase; - break; - } - } - - return ret; -} - -/* Update size with this section: return offset. */ -static long get_offset(unsigned int *size, Elf_Shdr *sechdr) -{ - long ret; - - ret = ALIGN(*size, sechdr->sh_addralign ?: 1); - *size = ret + sechdr->sh_size; - return ret; -} - -/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld - might -- code, read-only data, read-write data, small data. Tally - sizes, and place the offsets into sh_entsize fields: high bit means it - belongs in init. 
*/ -static void layout_sections(struct module *mod, - const Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - static unsigned long const masks[][2] = { - /* NOTE: all executable code must be the first section - * in this array; otherwise modify the text_size - * finder in the two loops below */ - { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, - { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, - { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, - { ARCH_SHF_SMALL | SHF_ALLOC, 0 } - }; - unsigned int m, i; - - for (i = 0; i < hdr->e_shnum; i++) - sechdrs[i].sh_entsize = ~0UL; - - DEBUGP("Core section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; - - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || strncmp(secstrings + s->sh_name, - ".init", 5) == 0) - continue; - s->sh_entsize = get_offset(&mod->core_size, s); - DEBUGP("\t%s\n", secstrings + s->sh_name); - } - if (m == 0) - mod->core_text_size = mod->core_size; - } - - DEBUGP("Init section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; - - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || strncmp(secstrings + s->sh_name, - ".init", 5) != 0) - continue; - s->sh_entsize = (get_offset(&mod->init_size, s) - | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", secstrings + s->sh_name); - } - if (m == 0) - mod->init_text_size = mod->init_size; - } -} - -static void set_license(struct module *mod, const char *license) -{ - if (!license) - license = "unspecified"; - - if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints " - "kernel.\n", mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - } -} - -/* Parse tag=value strings from 
.modinfo section */ -static char *next_string(char *string, unsigned long *secsize) -{ - /* Skip non-zero chars */ - while (string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - - /* Skip any zero padding. */ - while (!string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - return string; -} - -static char *get_modinfo(Elf_Shdr *sechdrs, - unsigned int info, - const char *tag) -{ - char *p; - unsigned int taglen = strlen(tag); - unsigned long size = sechdrs[info].sh_size; - - for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { - if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') - return p + taglen + 1; - } - return NULL; -} - -static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, - unsigned int infoindex) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (attr->setup) - attr->setup(mod, - get_modinfo(sechdrs, - infoindex, - attr->attr.name)); - } -} - -#ifdef CONFIG_KALLSYMS - -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - -static int is_exported(const char *name, const struct module *mod) -{ - if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) - return 1; - else - if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) - return 1; - else - return 0; -} - -/* As per nm */ -static char elf_type(const Elf_Sym *sym, - Elf_Shdr *sechdrs, - const char *secstrings, - struct module *mod) -{ - if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { - if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) - return 'v'; - else - return 'w'; - } - if (sym->st_shndx == SHN_UNDEF) - return 'U'; - if (sym->st_shndx == SHN_ABS) - return 'a'; - if (sym->st_shndx >= 
SHN_LORESERVE) - return '?'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) - return 't'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC - && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { - if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) - return 'r'; - else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 'g'; - else - return 'd'; - } - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 's'; - else - return 'b'; - } - if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug", strlen(".debug")) == 0) - return 'n'; - return '?'; -} - -static void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const char *secstrings) -{ - unsigned int i; - - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; - - /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info - = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); -} -#else -static inline void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const char *secstrings) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void *module_alloc_update_bounds(unsigned long size) -{ - void *ret = module_alloc(size); - - if (ret) { - /* Update module bounds. */ - if ((unsigned long)ret < module_addr_min) - module_addr_min = (unsigned long)ret; - if ((unsigned long)ret + size > module_addr_max) - module_addr_max = (unsigned long)ret + size; - } - return ret; -} - -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. 
*/ -static noinline struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) -{ - Elf_Ehdr *hdr; - Elf_Shdr *sechdrs; - char *secstrings, *args, *modmagic, *strtab = NULL; - unsigned int i; - unsigned int symindex = 0; - unsigned int strindex = 0; - unsigned int setupindex; - unsigned int exindex; - unsigned int exportindex; - unsigned int modindex; - unsigned int obsparmindex; - unsigned int infoindex; - unsigned int gplindex; - unsigned int crcindex; - unsigned int gplcrcindex; - unsigned int versindex; - unsigned int pcpuindex; - unsigned int gplfutureindex; - unsigned int gplfuturecrcindex; - unsigned int unwindex = 0; -#ifdef CONFIG_UNUSED_SYMBOLS - unsigned int unusedindex; - unsigned int unusedcrcindex; - unsigned int unusedgplindex; - unsigned int unusedgplcrcindex; -#endif - unsigned int markersindex; - unsigned int markersstringsindex; - struct module *mod; - long err = 0; - void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - struct exception_table_entry *extable; - mm_segment_t old_fs; - - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); - if (len < sizeof(*hdr)) - return ERR_PTR(-ENOEXEC); - - /* Suck in entire file: we'll want most of it. */ - /* vmalloc barfs on "unusual" numbers. 
Check here */ - if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return ERR_PTR(-ENOMEM); - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; - } - - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(*sechdrs)) { - err = -ENOEXEC; - goto free_hdr; - } - - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) - goto truncated; - - /* Convenience variables */ - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - sechdrs[0].sh_addr = 0; - - for (i = 1; i < hdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_NOBITS - && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) - goto truncated; - - /* Mark all sections sh_addr with their address in the - temporary image. */ - sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; - - /* Internal symbols and strings. 
*/ - if (sechdrs[i].sh_type == SHT_SYMTAB) { - symindex = i; - strindex = sechdrs[i].sh_link; - strtab = (char *)hdr + sechdrs[strindex].sh_offset; - } -#ifndef CONFIG_MODULE_UNLOAD - /* Don't load .exit sections */ - if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; -#endif - } - - modindex = find_sec(hdr, sechdrs, secstrings, - ".gnu.linkonce.this_module"); - if (!modindex) { - printk(KERN_WARNING "No module found in object\n"); - err = -ENOEXEC; - goto free_hdr; - } - mod = (void *)sechdrs[modindex].sh_addr; - - if (symindex == 0) { - printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", - mod->name); - err = -ENOEXEC; - goto free_hdr; - } - - /* Optional sections */ - exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); - gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); - gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); - gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); -#ifdef CONFIG_UNUSED_SYMBOLS - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); - unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); - unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); -#endif - setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); - exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); - obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); - versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); - infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); - pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); -#ifdef ARCH_UNWIND_SECTION_NAME - unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); 
-#endif - - /* Don't keep modinfo and version sections. */ - sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif - if (unwindex) - sechdrs[unwindex].sh_flags |= SHF_ALLOC; - - /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(sechdrs, versindex, mod)) { - err = -ENOEXEC; - goto free_hdr; - } - - modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); - /* This is allowed: modprobe --force will invalidate it. */ - if (!modmagic) { - err = try_to_force_load(mod, "magic"); - if (err) - goto free_hdr; - } else if (!same_magic(modmagic, vermagic, versindex)) { - printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); - err = -ENOEXEC; - goto free_hdr; - } - - /* Now copy in args */ - args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(args)) { - err = PTR_ERR(args); - goto free_hdr; - } - - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - - mod->state = MODULE_STATE_COMING; - - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); - if (err < 0) - goto free_mod; - - if (pcpuindex) { - /* We have a special allocation for this section. */ - percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign, - mod->name); - if (!percpu) { - err = -ENOMEM; - goto free_mod; - } - sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - mod->percpu = percpu; - } - - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, hdr, sechdrs, secstrings); - - /* Do the allocs. 
*/ - ptr = module_alloc_update_bounds(mod->core_size); - if (!ptr) { - err = -ENOMEM; - goto free_percpu; - } - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; - - ptr = module_alloc_update_bounds(mod->init_size); - if (!ptr && mod->init_size) { - err = -ENOMEM; - goto free_core; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; - - /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); - for (i = 0; i < hdr->e_shnum; i++) { - void *dest; - - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init - + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); - else - dest = mod->module_core + sechdrs[i].sh_entsize; - - if (sechdrs[i].sh_type != SHT_NOBITS) - memcpy(dest, (void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size); - /* Update sh_addr to point to copy in image. */ - sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); - } - /* Module has been moved. */ - mod = (void *)sechdrs[modindex].sh_addr; - - /* Now we've moved module, initialize linked lists, etc. */ - module_unload_init(mod); - - /* add kobject, so we can reference it. */ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - - /* - * ndiswrapper is under GPL by itself, but loads proprietary modules. - * Don't use add_taint_module(), as it would prevent ndiswrapper from - * using GPL-only symbols it needs. 
- */ - if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); - - /* driverloader was caught wrongly pretending to be under GPL */ - if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, sechdrs, infoindex); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, - mod); - if (err < 0) - goto cleanup; - - /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ - mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); - mod->syms = (void *)sechdrs[exportindex].sh_addr; - if (crcindex) - mod->crcs = (void *)sechdrs[crcindex].sh_addr; - mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); - mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; - if (gplcrcindex) - mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; - mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / - sizeof(*mod->gpl_future_syms); - mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; - if (gplfuturecrcindex) - mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); - mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; - if (unusedcrcindex) - mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; - mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; - if (unusedgplcrcindex) - mod->unused_gpl_crcs - = (void *)sechdrs[unusedgplcrcindex].sh_addr; -#endif - -#ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !crcindex) - || (mod->num_gpl_syms && !gplcrcindex) - || (mod->num_gpl_future_syms && !gplfuturecrcindex) -#ifdef CONFIG_UNUSED_SYMBOLS - || (mod->num_unused_syms && !unusedcrcindex) - || 
(mod->num_unused_gpl_syms && !unusedgplcrcindex) -#endif - ) { - printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); - err = try_to_force_load(mod, "nocrc"); - if (err) - goto cleanup; - } -#endif - markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); - markersstringsindex = find_sec(hdr, sechdrs, secstrings, - "__markers_strings"); - - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { - const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (info >= hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(sechdrs[info].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(sechdrs, strtab, symindex, i,mod); - else if (sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(sechdrs, strtab, symindex, i, - mod); - if (err < 0) - goto cleanup; - } -#ifdef CONFIG_MARKERS - mod->markers = (void *)sechdrs[markersindex].sh_addr; - mod->num_markers = - sechdrs[markersindex].sh_size / sizeof(*mod->markers); -#endif - - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - - if (err < 0) - goto cleanup; - - /* Set up and sort exception table */ - mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); - mod->extable = extable = (void *)sechdrs[exindex].sh_addr; - sort_extable(extable, extable + mod->num_exentries); - - /* Finally, copy percpu area over. 
*/ - percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, - sechdrs[pcpuindex].sh_size); - - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); - -#ifdef CONFIG_MARKERS - if (!mod->taints) - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); -#endif - err = module_finalize(hdr, sechdrs, mod); - if (err < 0) - goto cleanup; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Flush the instruction cache, since we've played with text. - * Do it before processing of module parameters, so the module - * can provide parameter accessor functions of its own. - */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); - - set_fs(old_fs); - - mod->args = args; - if (obsparmindex) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); - - /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since - * strong_try_module_get() will fail. */ - stop_machine(__link_module, mod, NULL); - - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->name, mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); - if (err < 0) - goto unlink; - - err = mod_sysfs_setup(mod, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param)); - if (err < 0) - goto unlink; - add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - - /* Size of section 0 is 0, so this works well if no unwind info. 
*/ - mod->unwind_info = unwind_add_table(mod, - (void *)sechdrs[unwindex].sh_addr, - sechdrs[unwindex].sh_size); - - /* Get rid of temporary copy */ - vfree(hdr); - - /* Done! */ - return mod; - - unlink: - stop_machine(__unlink_module, mod, NULL); - module_arch_cleanup(mod); - cleanup: - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: - module_unload_free(mod); - module_free(mod, mod->module_init); - free_core: - module_free(mod, mod->module_core); - free_percpu: - if (percpu) - percpu_modfree(percpu); - free_mod: - kfree(args); - free_hdr: - vfree(hdr); - return ERR_PTR(err); - - truncated: - printk(KERN_ERR "Module len %lu truncated\n", len); - err = -ENOEXEC; - goto free_hdr; -} - -/* This is where the real work happens */ -SYSCALL_DEFINE3(init_module, void __user *, umod, - unsigned long, len, const char __user *, uargs) -{ - struct module *mod; - int ret = 0; - - /* Must have permission */ - if (!capable(CAP_SYS_MODULE)) - return -EPERM; - - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); - return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); - - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. 
*/ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); - free_module(mod); - mutex_unlock(&module_mutex); - wake_up(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " - "it should follow 0/-E convention\n" - KERN_WARNING "%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } - - /* Now it's a first class citizen! Wake up anyone waiting for it. */ - mod->state = MODULE_STATE_LIVE; - wake_up(&module_wq); - - mutex_lock(&module_mutex); - /* Drop initial reference. */ - module_put(mod); - unwind_remove_table(mod->unwind_info, 1); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); - - return 0; -} - -static inline int within(unsigned long addr, void *start, unsigned long size) -{ - return ((void *)addr >= start && (void *)addr < start + size); -} - -#ifdef CONFIG_KALLSYMS -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - return str[0] == '$' && strchr("atd", str[1]) - && (str[2] == '\0' || str[2] == '.'); -} - -static const char *get_ksymbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) -{ - unsigned int i, best = 0; - unsigned long nextval; - - /* At worse, next value is at end of module */ - if (within(addr, mod->module_init, mod->init_size)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; - else - nextval = (unsigned long)mod->module_core+mod->core_text_size; - - /* Scan for closest preceeding symbol, and next symbol. (ELF - starts real symbols at 1). 
*/ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) - continue; - - /* We ignore unnamed symbols: they're uninformative - * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; - } - - if (!best) - return NULL; - - if (size) - *size = nextval - mod->symtab[best].st_value; - if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; -} - -/* For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. 
*/ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - char *namebuf) -{ - struct module *mod; - const char *ret = NULL; - - preempt_disable(); - list_for_each_entry(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) - || within(addr, mod->module_core, mod->core_size)) { - if (modname) - *modname = mod->name; - ret = get_ksymbol(mod, addr, size, offset); - break; - } - } - /* Make a copy in here where it's safe */ - if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); - ret = namebuf; - } - preempt_enable(); - return ret; -} - -int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { - const char *sym; - - sym = get_ksymbol(mod, addr, NULL, NULL); - if (!sym) - goto out; - strlcpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { - const char *sym; - - sym = get_ksymbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strlcpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry(mod, &modules, list) { - if (symnum < mod->num_symtab) { - *value = 
mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); - strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, mod); - preempt_enable(); - return 0; - } - symnum -= mod->num_symtab; - } - preempt_enable(); - return -ERANGE; -} - -static unsigned long mod_find_symname(struct module *mod, const char *name) -{ - unsigned int i; - - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; - return 0; -} - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) -{ - struct module *mod; - char *colon; - unsigned long ret = 0; - - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); - if ((colon = strchr(name, ':')) != NULL) { - *colon = '\0'; - if ((mod = find_module(name)) != NULL) - ret = mod_find_symname(mod, colon+1); - *colon = ':'; - } else { - list_for_each_entry(mod, &modules, list) - if ((ret = mod_find_symname(mod, name)) != 0) - break; - } - preempt_enable(); - return ret; -} -#endif /* CONFIG_KALLSYMS */ - -/* Called by the /proc file system to return a list of modules. 
*/ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - -static char *module_flags(struct module *mod, char *buf) -{ - int bx = 0; - - if (mod->taints || - mod->state == MODULE_STATE_GOING || - mod->state == MODULE_STATE_COMING) { - buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) - buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) - buf[bx++] = 'F'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - - /* Show a - for module-is-being-unloaded */ - if (mod->state == MODULE_STATE_GOING) - buf[bx++] = '-'; - /* Show a + for module-is-being-loaded */ - if (mod->state == MODULE_STATE_COMING) - buf[bx++] = '+'; - buf[bx++] = ')'; - } - buf[bx] = '\0'; - - return buf; -} - -static int m_show(struct seq_file *m, void *p) -{ - struct module *mod = list_entry(p, struct module, list); - char buf[8]; - - seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); - print_unload_info(m, mod); - - /* Informative for users. */ - seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading": - mod->state == MODULE_STATE_COMING ? "Loading": - "Live"); - /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); - - /* Taints info */ - if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); - - seq_printf(m, "\n"); - return 0; -} - -/* Format: modulename size refcount deps address - - Where refcount is a number or -, and deps is a comma-separated list - of depends or -. 
-*/ -const struct seq_operations modules_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = m_show -}; - -/* Given an address, look for it in the module exception tables. */ -const struct exception_table_entry *search_module_extables(unsigned long addr) -{ - const struct exception_table_entry *e = NULL; - struct module *mod; - - preempt_disable(); - list_for_each_entry(mod, &modules, list) { - if (mod->num_exentries == 0) - continue; - - e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, - addr); - if (e) - break; - } - preempt_enable(); - - /* Now, if we found one, we are running inside it now, hence - we cannot unload the module, hence no refcnt needed. */ - return e; -} - -/* - * Is this a valid module address? - */ -int is_module_address(unsigned long addr) -{ - struct module *mod; - - preempt_disable(); - - list_for_each_entry(mod, &modules, list) { - if (within(addr, mod->module_core, mod->core_size)) { - preempt_enable(); - return 1; - } - } - - preempt_enable(); - - return 0; -} - - -/* Is this a valid kernel address? */ -struct module *__module_text_address(unsigned long addr) -{ - struct module *mod; - - if (addr < module_addr_min || addr > module_addr_max) - return NULL; - - list_for_each_entry(mod, &modules, list) - if (within(addr, mod->module_init, mod->init_text_size) - || within(addr, mod->module_core, mod->core_text_size)) - return mod; - return NULL; -} - -struct module *module_text_address(unsigned long addr) -{ - struct module *mod; - - preempt_disable(); - mod = __module_text_address(addr); - preempt_enable(); - - return mod; -} - -/* Don't grab lock, we're oopsing. 
*/ -void print_modules(void) -{ - struct module *mod; - char buf[8]; - - printk("Modules linked in:"); - list_for_each_entry(mod, &modules, list) - printk(" %s%s", mod->name, module_flags(mod, buf)); - if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); -} - -#ifdef CONFIG_MODVERSIONS -/* Generate the signature for struct module here, too, for modversions. */ -void struct_module(struct module *mod) { return; } -EXPORT_SYMBOL(struct_module); -#endif - -#ifdef CONFIG_MARKERS -void module_update_markers(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); - mutex_unlock(&module_mutex); -} -#endif -/* - * kernel/mutex-debug.c - * - * Debugging code for mutexes - * - * Started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * lock debugging, locking tree, deadlock detection started by: - * - * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Released under the General Public License (GPL). - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mutex-debug.h" - -/* - * Must be called with lock->wait_lock held. 
- */ -void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) -{ - lock->owner = new_owner; -} - -void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ - memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); - waiter->magic = waiter; - INIT_LIST_HEAD(&waiter->list); -} - -void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) -{ - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); - DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); -} - -void debug_mutex_free_waiter(struct mutex_waiter *waiter) -{ - DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); - memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); -} - -void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) -{ - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); - - /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; - waiter->lock = lock; -} - -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) -{ - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; - - list_del_init(&waiter->list); - waiter->task = NULL; -} - -void debug_mutex_unlock(struct mutex *lock) -{ - if (unlikely(!debug_locks)) - return; - - DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); -} - -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, 
sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->owner = NULL; - lock->magic = lock; -} - -/*** - * mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void mutex_destroy(struct mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); - lock->magic = NULL; -} - -EXPORT_SYMBOL_GPL(mutex_destroy); -/* - * kernel/mutex.c - * - * Mutexes: blocking mutual exclusion locks - * - * Started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and - * David Howells for suggestions and improvements. - * - * Also see Documentation/mutex-design.txt. - */ -#include -#include -#include -#include -#include -#include - -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ -#ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" -# include -#else -# include "mutex.h" -# include -#endif - -/*** - * mutex_init - initialize the mutex - * @lock: the mutex to be initialized - * @key: the lock_class_key for the class; used by mutex lock debugging - * - * Initialize the mutex to unlocked state. - * - * It is not allowed to initialize an already locked mutex. - */ -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) -{ - atomic_set(&lock->count, 1); - spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); - - debug_mutex_init(lock, name, key); -} - -EXPORT_SYMBOL(__mutex_init); - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -/* - * We split the mutex lock/unlock logic into separate fastpath and - * slowpath functions, to reduce the register pressure on the fastpath. 
- * We also put the fastpath first in the kernel image, to make sure the - * branch is predicted by the CPU as default-untaken. - */ -static void noinline __sched -__mutex_lock_slowpath(atomic_t *lock_count); - -/*** - * mutex_lock - acquire the mutex - * @lock: the mutex to be acquired - * - * Lock the mutex exclusively for this task. If the mutex is not - * available right now, it will sleep until it can get it. - * - * The mutex must later on be released by the same task that - * acquired it. Recursive locking is not allowed. The task - * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with - * the mutex still locked. The mutex must first be initialized - * (or statically defined) before it can be locked. memset()-ing - * the mutex to 0 is not allowed. - * - * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging - * checks that will enforce the restrictions and will also do - * deadlock debugging. ) - * - * This function is similar to (but not equivalent to) down(). - */ -void inline __sched mutex_lock(struct mutex *lock) -{ - might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); -} - -EXPORT_SYMBOL(mutex_lock); -#endif - -static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); - -/*** - * mutex_unlock - release the mutex - * @lock: the mutex to be released - * - * Unlock a mutex that has been locked by this task previously. - * - * This function must not be used in interrupt context. Unlocking - * of a not locked mutex is not allowed. - * - * This function is similar to (but not equivalent to) up(). 
- */ -void __sched mutex_unlock(struct mutex *lock) -{ - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); -} - -EXPORT_SYMBOL(mutex_unlock); - -/* - * Lock a mutex (possibly interruptible), slowpath: - */ -static inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - unsigned long ip) -{ - struct task_struct *task = current; - struct mutex_waiter waiter; - unsigned int old_val; - unsigned long flags; - - spin_lock_mutex(&lock->wait_lock, flags); - - debug_mutex_lock_common(lock, &waiter); - mutex_acquire(&lock->dep_map, subclass, 0, ip); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); - - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; - - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) - goto done; - - lock_contended(&lock->dep_map, ip); - - for (;;) { - /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters: - */ - old_val = atomic_xchg(&lock->count, -1); - if (old_val == 1) - break; - - /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) 
- */ - if (unlikely(signal_pending_state(state, task))) { - mutex_remove_waiter(lock, &waiter, - task_thread_info(task)); - mutex_release(&lock->dep_map, 1, ip); - spin_unlock_mutex(&lock->wait_lock, flags); - - debug_mutex_free_waiter(&waiter); - return -EINTR; - } - __set_task_state(task, state); - - /* didnt get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock, flags); - schedule(); - spin_lock_mutex(&lock->wait_lock, flags); - } - -done: - lock_acquired(&lock->dep_map); - /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - debug_mutex_set_owner(lock, task_thread_info(task)); - - /* set it to 0 if there are no waiters left: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - debug_mutex_free_waiter(&waiter); - - return 0; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __sched -mutex_lock_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); -} - -EXPORT_SYMBOL_GPL(mutex_lock_nested); - -int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); -} -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); - -int __sched -mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); -} - -EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -#endif - -/* - * Release the lock, slowpath: - */ -static inline void -__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - - /* - * some architectures leave the lock 
unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - */ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); - - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); - - debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); - } - - debug_mutex_clear_owner(lock); - - spin_unlock_mutex(&lock->wait_lock, flags); -} - -/* - * Release the lock, slowpath: - */ -static noinline void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - __mutex_unlock_common_slowpath(lock_count, 1); -} - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -/* - * Here come the less common (and hence less performance-critical) APIs: - * mutex_lock_interruptible() and mutex_trylock(). - */ -static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count); - -static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count); - -/*** - * mutex_lock_interruptible - acquire the mutex, interruptable - * @lock: the mutex to be acquired - * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. - * - * This function is similar to (but not equivalent to) down_interruptible(). 
- */ -int __sched mutex_lock_interruptible(struct mutex *lock) -{ - might_sleep(); - return __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); -} - -EXPORT_SYMBOL(mutex_lock_interruptible); - -int __sched mutex_lock_killable(struct mutex *lock) -{ - might_sleep(); - return __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_killable_slowpath); -} -EXPORT_SYMBOL(mutex_lock_killable); - -static noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); -} - -static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); -} - -static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); -} -#endif - -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) { - debug_mutex_set_owner(lock, current_thread_info()); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - -/*** - * mutex_trylock - try acquire the mutex, without waiting - * @lock: the mutex to be acquired - * - * Try to acquire the mutex atomically. 
Returns 1 if the mutex - * has been acquired successfully, and 0 on contention. - * - * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful - * about this when converting semaphore users to mutexes. - * - * This function must not be used in interrupt context. The - * mutex must be released by the same task that acquired it. - */ -int __sched mutex_trylock(struct mutex *lock) -{ - return __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); -} - -EXPORT_SYMBOL(mutex_trylock); -#include -#include -#include -#include -#include -#include -#include - -/* - * Notifier list for kernel code which wants to be called - * at shutdown. This is used to stop any idling DMA operations - * and the like. - */ -BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); - -/* - * Notifier chain core routines. The exported routines below - * are layered on top of these, with appropriate locking added. - */ - -static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - -static int notifier_chain_cond_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) - return 0; - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - -static int notifier_chain_unregister(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) { - rcu_assign_pointer(*nl, n->next); - return 0; - } - nl = &((*nl)->next); - } - return -ENOENT; -} - -/** - * notifier_call_chain - Informs the registered notifiers about an event. 
- * @nl: Pointer to head of the blocking notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: Number of notifier functions to be called. Don't care - * value of this parameter is -1. - * @nr_calls: Records the number of notifications sent. Don't care - * value of this field is NULL. - * @returns: notifier_call_chain returns the value returned by the - * last notifier function called. - */ -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret = NOTIFY_DONE; - struct notifier_block *nb, *next_nb; - - nb = rcu_dereference(*nl); - - while (nb && nr_to_call) { - next_nb = rcu_dereference(nb->next); - ret = nb->notifier_call(nb, val, v); - - if (nr_calls) - (*nr_calls)++; - - if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) - break; - nb = next_nb; - nr_to_call--; - } - return ret; -} - -/* - * Atomic notifier chain routines. Registration and unregistration - * use a spinlock, and call_chain is synchronized by RCU (no locks). - */ - -/** - * atomic_notifier_chain_register - Add notifier to an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to an atomic notifier chain. - * - * Currently always returns zero. - */ -int atomic_notifier_chain_register(struct atomic_notifier_head *nh, - struct notifier_block *n) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_register(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); - -/** - * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from an atomic notifier chain. 
- * - * Returns zero on success or %-ENOENT on failure. - */ -int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, - struct notifier_block *n) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_unregister(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); - synchronize_rcu(); - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); - -/** - * __atomic_notifier_call_chain - Call functions in an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See the comment for notifier_call_chain. - * @nr_calls: See the comment for notifier_call_chain. - * - * Calls each function in a notifier chain in turn. The functions - * run in an atomic context, so they must not block. - * This routine uses RCU to synchronize with changes to the chain. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ -int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret; - - rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); - -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) -{ - return __atomic_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); - -/* - * Blocking notifier chain routines. All access to the chain is - * synchronized by an rwsem. 
- */ - -/** - * blocking_notifier_chain_register - Add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain. - * Must be called in process context. - * - * Currently always returns zero. - */ -int blocking_notifier_chain_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call down_write(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); - - down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); - -/** - * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain, only if not already - * present in the chain. - * Must be called in process context. - * - * Currently always returns zero. - */ -int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - down_write(&nh->rwsem); - ret = notifier_chain_cond_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); - -/** - * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from a blocking notifier chain. - * Must be called from process context. - * - * Returns zero on success or %-ENOENT on failure. 
- */ -int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call down_write(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_unregister(&nh->head, n); - - down_write(&nh->rwsem); - ret = notifier_chain_unregister(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); - -/** - * __blocking_notifier_call_chain - Call functions in a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain. - * - * Calls each function in a notifier chain in turn. The functions - * run in a process context, so they are allowed to block. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. 
- */ -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret = NOTIFY_DONE; - - /* - * We check the head outside the lock, but if this access is - * racy then it does not matter what the result of the test - * is, we re-check the list after having taken the lock anyway: - */ - if (rcu_dereference(nh->head)) { - down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, - nr_calls); - up_read(&nh->rwsem); - } - return ret; -} -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); - -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) -{ - return __blocking_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); - -/* - * Raw notifier chain routines. There is no protection; - * the caller must provide it. Use at your own risk! - */ - -/** - * raw_notifier_chain_register - Add notifier to a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a raw notifier chain. - * All locking must be provided by the caller. - * - * Currently always returns zero. - */ -int raw_notifier_chain_register(struct raw_notifier_head *nh, - struct notifier_block *n) -{ - return notifier_chain_register(&nh->head, n); -} -EXPORT_SYMBOL_GPL(raw_notifier_chain_register); - -/** - * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from a raw notifier chain. - * All locking must be provided by the caller. - * - * Returns zero on success or %-ENOENT on failure. 
- */ -int raw_notifier_chain_unregister(struct raw_notifier_head *nh, - struct notifier_block *n) -{ - return notifier_chain_unregister(&nh->head, n); -} -EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); - -/** - * __raw_notifier_call_chain - Call functions in a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain - * - * Calls each function in a notifier chain in turn. The functions - * run in an undefined context. - * All locking must be provided by the caller. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ -int __raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -} -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); - -int raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v) -{ - return __raw_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(raw_notifier_call_chain); - -/* - * SRCU notifier chain routines. Registration and unregistration - * use a mutex, and call_chain is synchronized by SRCU (no locks). - */ - -/** - * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to an SRCU notifier chain. - * Must be called in process context. - * - * Currently always returns zero. 
- */ -int srcu_notifier_chain_register(struct srcu_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call mutex_lock(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); - - mutex_lock(&nh->mutex); - ret = notifier_chain_register(&nh->head, n); - mutex_unlock(&nh->mutex); - return ret; -} -EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); - -/** - * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from an SRCU notifier chain. - * Must be called from process context. - * - * Returns zero on success or %-ENOENT on failure. - */ -int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call mutex_lock(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_unregister(&nh->head, n); - - mutex_lock(&nh->mutex); - ret = notifier_chain_unregister(&nh->head, n); - mutex_unlock(&nh->mutex); - synchronize_srcu(&nh->srcu); - return ret; -} -EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); - -/** - * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain - * - * Calls each function in a notifier chain in turn. The functions - * run in a process context, so they are allowed to block. 
- * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret; - int idx; - - idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - srcu_read_unlock(&nh->srcu, idx); - return ret; -} -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); - -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) -{ - return __srcu_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); - -/** - * srcu_init_notifier_head - Initialize an SRCU notifier head - * @nh: Pointer to head of the srcu notifier chain - * - * Unlike other sorts of notifier heads, SRCU notifier heads require - * dynamic initialization. Be sure to call this routine before - * calling any of the other SRCU notifier routines for this head. - * - * If an SRCU notifier head is deallocated, it must first be cleaned - * up by calling srcu_cleanup_notifier_head(). Otherwise the head's - * per-cpu data (used by the SRCU mechanism) will leak. - */ -void srcu_init_notifier_head(struct srcu_notifier_head *nh) -{ - mutex_init(&nh->mutex); - if (init_srcu_struct(&nh->srcu) < 0) - BUG(); - nh->head = NULL; -} -EXPORT_SYMBOL_GPL(srcu_init_notifier_head); - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. 
- */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -static ATOMIC_NOTIFIER_HEAD(die_chain); - -int notify_die(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - struct die_args args = { - .regs = regs, - .str = str, - .err = err, - .trapnr = trap, - .signr = sig, - - }; - return atomic_notifier_call_chain(&die_chain, val, &args); -} - -int register_die_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(&die_chain, nb); -} -EXPORT_SYMBOL_GPL(register_die_notifier); - -int unregister_die_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&die_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_die_notifier); -/* - * ns_cgroup.c - namespace cgroup subsystem - * - * Copyright 2006, 2007 IBM Corp - */ - -#include -#include -#include -#include -#include -#include - -struct ns_cgroup { - struct cgroup_subsys_state css; - spinlock_t lock; -}; - -struct cgroup_subsys ns_subsys; - -static inline struct ns_cgroup *cgroup_to_ns( - struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), - struct ns_cgroup, css); -} - -int ns_cgroup_clone(struct task_struct *task, struct pid *pid) -{ - char name[PROC_NUMBUF]; - - snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); - return cgroup_clone(task, &ns_subsys, name); -} - -/* - * Rules: - * 1. 
you can only enter a cgroup which is a child of your current - * cgroup - * 2. you can only place another process into a cgroup if - * a. you have CAP_SYS_ADMIN - * b. your cgroup is an ancestor of task's destination cgroup - * (hence either you are in the same cgroup as task, or in an - * ancestor cgroup thereof) - */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) -{ - struct cgroup *orig; - - if (current != task) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!cgroup_is_descendant(new_cgroup)) - return -EPERM; - } - - if (atomic_read(&new_cgroup->count) != 0) - return -EPERM; - - orig = task_cgroup(task, ns_subsys_id); - if (orig && orig != new_cgroup->parent) - return -EPERM; - - return 0; -} - -/* - * Rules: you can only create a cgroup if - * 1. you are capable(CAP_SYS_ADMIN) - * 2. the target cgroup is a descendant of your own cgroup - */ -static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup)) - return ERR_PTR(-EPERM); - - ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); - if (!ns_cgroup) - return ERR_PTR(-ENOMEM); - spin_lock_init(&ns_cgroup->lock); - return &ns_cgroup->css; -} - -static void ns_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - ns_cgroup = cgroup_to_ns(cgroup); - kfree(ns_cgroup); -} - -struct cgroup_subsys ns_subsys = { - .name = "ns", - .can_attach = ns_can_attach, - .create = ns_create, - .destroy = ns_destroy, - .subsys_id = ns_subsys_id, -}; -/* - * Copyright (C) 2006 IBM Corporation - * - * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. 
- * - * Jun 2006 - namespaces support - * OpenVZ, SWsoft Inc. - * Pavel Emelianov - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *nsproxy_cachep; - -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); - -/* - * creates a copy of "orig" with refcount 1. - */ -static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) -{ - struct nsproxy *ns; - - ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); - if (ns) { - memcpy(ns, orig, sizeof(struct nsproxy)); - atomic_set(&ns->count, 1); - } - return ns; -} - -/* - * Create new nsproxy and all of its the associated namespaces. - * Return the newly created nsproxy. Do not attach this to the task, - * leave it to the caller to do proper locking and attach it to task. - */ -static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) -{ - struct nsproxy *new_nsp; - int err; - - new_nsp = clone_nsproxy(tsk->nsproxy); - if (!new_nsp) - return ERR_PTR(-ENOMEM); - - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); - if (IS_ERR(new_nsp->mnt_ns)) { - err = PTR_ERR(new_nsp->mnt_ns); - goto out_ns; - } - - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); - if (IS_ERR(new_nsp->uts_ns)) { - err = PTR_ERR(new_nsp->uts_ns); - goto out_uts; - } - - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); - if (IS_ERR(new_nsp->ipc_ns)) { - err = PTR_ERR(new_nsp->ipc_ns); - goto out_ipc; - } - - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); - if (IS_ERR(new_nsp->pid_ns)) { - err = PTR_ERR(new_nsp->pid_ns); - goto out_pid; - } - - new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns); - if (IS_ERR(new_nsp->user_ns)) { - err = PTR_ERR(new_nsp->user_ns); - goto out_user; - } - - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); - if (IS_ERR(new_nsp->net_ns)) { - err = PTR_ERR(new_nsp->net_ns); - goto out_net; - } - - return new_nsp; - 
-out_net: - if (new_nsp->user_ns) - put_user_ns(new_nsp->user_ns); -out_user: - if (new_nsp->pid_ns) - put_pid_ns(new_nsp->pid_ns); -out_pid: - if (new_nsp->ipc_ns) - put_ipc_ns(new_nsp->ipc_ns); -out_ipc: - if (new_nsp->uts_ns) - put_uts_ns(new_nsp->uts_ns); -out_uts: - if (new_nsp->mnt_ns) - put_mnt_ns(new_nsp->mnt_ns); -out_ns: - kmem_cache_free(nsproxy_cachep, new_nsp); - return ERR_PTR(err); -} - -/* - * called from clone. This now handles copy for nsproxy and all - * namespaces therein. - */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) -{ - struct nsproxy *old_ns = tsk->nsproxy; - struct nsproxy *new_ns; - int err = 0; - - if (!old_ns) - return 0; - - get_nsproxy(old_ns); - - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } - - /* - * CLONE_NEWIPC must detach from the undolist: after switching - * to a new ipc namespace, the semaphore arrays from the old - * namespace are unreachable. In clone parlance, CLONE_SYSVSEM - * means share undolist with parent, so we must forbid using - * it along with CLONE_NEWIPC. - */ - if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { - err = -EINVAL; - goto out; - } - - new_ns = create_new_namespaces(flags, tsk, tsk->fs); - if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto out; - } - - tsk->nsproxy = new_ns; - -out: - put_nsproxy(old_ns); - return err; -} - -void free_nsproxy(struct nsproxy *ns) -{ - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - put_pid_ns(ns->pid_ns); - if (ns->user_ns) - put_user_ns(ns->user_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - -/* - * Called from unshare. Unshare all the namespaces part of nsproxy. - * On success, returns the new nsproxy. 
- */ -int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) -{ - int err = 0; - - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER | CLONE_NEWNET))) - return 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); - if (IS_ERR(*new_nsp)) { - err = PTR_ERR(*new_nsp); - goto out; - } - - err = ns_cgroup_clone(current, task_pid(current)); - if (err) - put_nsproxy(*new_nsp); - -out: - return err; -} - -void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) -{ - struct nsproxy *ns; - - might_sleep(); - - ns = p->nsproxy; - - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. - * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); - free_nsproxy(ns); - } -} - -void exit_task_namespaces(struct task_struct *p) -{ - switch_task_namespaces(p, NULL); -} - -static int __init nsproxy_cache_init(void) -{ - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); - return 0; -} - -module_init(nsproxy_cache_init); -/* - * linux/kernel/panic.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * This function is used through-out the kernel (including mm and fs) - * to indicate a major problem. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int panic_on_oops; -int tainted; -static int pause_on_oops; -static int pause_on_oops_flag; -static DEFINE_SPINLOCK(pause_on_oops_lock); - -int panic_timeout; - -ATOMIC_NOTIFIER_HEAD(panic_notifier_list); - -EXPORT_SYMBOL(panic_notifier_list); - -static int __init panic_setup(char *str) -{ - panic_timeout = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("panic=", panic_setup); - -static long no_blink(long time) -{ - return 0; -} - -/* Returns how long it waited in ms */ -long (*panic_blink)(long time); -EXPORT_SYMBOL(panic_blink); - -/** - * panic - halt the system - * @fmt: The text string to print - * - * Display a message, then perform cleanups. - * - * This function never returns. - */ - -NORET_TYPE void panic(const char * fmt, ...) -{ - long i; - static char buf[1024]; - va_list args; -#if defined(CONFIG_S390) - unsigned long caller = (unsigned long) __builtin_return_address(0); -#endif - - /* - * It's possible to come here directly from a panic-assertion and not - * have preempt disabled. Some functions called from here want - * preempt to be disabled. No point enabling it later though... - */ - preempt_disable(); - - bust_spinlocks(1); - va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); - bust_spinlocks(0); - - /* - * If we have crashed and we have a crash kernel loaded let it handle - * everything else. - * Do we want to call this before we try to display a message? - */ - crash_kexec(NULL); - -#ifdef CONFIG_SMP - /* - * Note smp_send_stop is the usual smp shutdown function, which - * unfortunately means it may not be hardened to work in a panic - * situation. 
- */ - smp_send_stop(); -#endif - - atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - - if (!panic_blink) - panic_blink = no_blink; - - if (panic_timeout > 0) { - /* - * Delay timeout seconds before rebooting the machine. - * We can't use the "normal" timers since we just panicked.. - */ - printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); - for (i = 0; i < panic_timeout*1000; ) { - touch_nmi_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; - } - /* This will not be a clean reboot, with everything - * shutting down. But if there is a chance of - * rebooting the system it will be rebooted. - */ - emergency_restart(); - } -#ifdef __sparc__ - { - extern int stop_a_enabled; - /* Make sure the user can actually press Stop-A (L1-A) */ - stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); - } -#endif -#if defined(CONFIG_S390) - disabled_wait(caller); -#endif - local_irq_enable(); - for (i = 0;;) { - touch_softlockup_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; - } -} - -EXPORT_SYMBOL(panic); - -/** - * print_tainted - return a string to represent the kernel taint state. - * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * - * The string is overwritten by the next call to print_taint(). - */ - -const char *print_tainted(void) -{ - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 
'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else - snprintf(buf, sizeof(buf), "Not tainted"); - return(buf); -} - -void add_taint(unsigned flag) -{ - debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; -} -EXPORT_SYMBOL(add_taint); - -static int __init pause_on_oops_setup(char *str) -{ - pause_on_oops = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("pause_on_oops=", pause_on_oops_setup); - -static void spin_msec(int msecs) -{ - int i; - - for (i = 0; i < msecs; i++) { - touch_nmi_watchdog(); - mdelay(1); - } -} - -/* - * It just happens that oops_enter() and oops_exit() are identically - * implemented... - */ -static void do_oops_enter_exit(void) -{ - unsigned long flags; - static int spin_counter; - - if (!pause_on_oops) - return; - - spin_lock_irqsave(&pause_on_oops_lock, flags); - if (pause_on_oops_flag == 0) { - /* This CPU may now print the oops message */ - pause_on_oops_flag = 1; - } else { - /* We need to stall this CPU */ - if (!spin_counter) { - /* This CPU gets to do the counting */ - spin_counter = pause_on_oops; - do { - spin_unlock(&pause_on_oops_lock); - spin_msec(MSEC_PER_SEC); - spin_lock(&pause_on_oops_lock); - } while (--spin_counter); - pause_on_oops_flag = 0; - } else { - /* This CPU waits for a different one */ - while (spin_counter) { - spin_unlock(&pause_on_oops_lock); - spin_msec(1); - spin_lock(&pause_on_oops_lock); - } - } - } - spin_unlock_irqrestore(&pause_on_oops_lock, flags); -} - -/* - * Return true if the calling CPU is allowed to print oops-related info. This - * is a bit racy.. - */ -int oops_may_print(void) -{ - return pause_on_oops_flag == 0; -} - -/* - * Called when the architecture enters its oops handler, before it prints - * anything. If this is the first CPU to oops, and it's oopsing the first time - * then let it proceed. 
- * - * This is all enabled by the pause_on_oops kernel boot option. We do all this - * to ensure that oopses don't scroll off the screen. It has the side-effect - * of preventing later-oopsing CPUs from mucking up the display, too. - * - * It turns out that the CPU which is allowed to print ends up pausing for the - * right duration, whereas all the other CPUs pause for twice as long: once in - * oops_enter(), once in oops_exit(). - */ -void oops_enter(void) -{ - debug_locks_off(); /* can't trust the integrity of the kernel anymore */ - do_oops_enter_exit(); -} - -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - - return 0; -} -late_initcall(init_oops_id); - -static void print_oops_end_marker(void) -{ - init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); -} - -/* - * Called when the architecture exits its oops handler, after printing - * everything. - */ -void oops_exit(void) -{ - do_oops_enter_exit(); - print_oops_end_marker(); -} - -#ifdef WANT_WARN_ON_SLOWPATH -void warn_on_slowpath(const char *file, int line) -{ - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long) __builtin_return_address(0); - sprint_symbol(function, caller); - - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); - print_modules(); - dump_stack(); - print_oops_end_marker(); - add_taint(TAINT_WARN); -} -EXPORT_SYMBOL(warn_on_slowpath); - - -void warn_slowpath(const char *file, int line, const char *fmt, ...) 
-{ - va_list args; - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long)__builtin_return_address(0); - sprint_symbol(function, caller); - - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - print_modules(); - dump_stack(); - print_oops_end_marker(); - add_taint(TAINT_WARN); -} -EXPORT_SYMBOL(warn_slowpath); -#endif - -#ifdef CONFIG_CC_STACKPROTECTOR -/* - * Called when gcc's -fstack-protector feature is used, and - * gcc detects corruption of the on-stack canary value - */ -void __stack_chk_fail(void) -{ - panic("stack-protector: Kernel stack is corrupted"); -} -EXPORT_SYMBOL(__stack_chk_fail); -#endif -/* Helpers for initial module or kernel cmdline parsing - Copyright (C) 2001 Rusty Russell. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include -#include - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt, a...) 
-#endif - -static inline char dash2underscore(char c) -{ - if (c == '-') - return '_'; - return c; -} - -static inline int parameq(const char *input, const char *paramname) -{ - unsigned int i; - for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) - if (input[i] == '\0') - return 1; - return 0; -} - -static int parse_one(char *param, - char *val, - struct kernel_param *params, - unsigned num_params, - int (*handle_unknown)(char *param, char *val)) -{ - unsigned int i; - - /* Find parameter */ - for (i = 0; i < num_params; i++) { - if (parameq(param, params[i].name)) { - DEBUGP("They are equal! Calling %p\n", - params[i].set); - return params[i].set(val, ¶ms[i]); - } - } - - if (handle_unknown) { - DEBUGP("Unknown argument: calling %p\n", handle_unknown); - return handle_unknown(param, val); - } - - DEBUGP("Unknown argument `%s'\n", param); - return -ENOENT; -} - -/* You can use " around spaces, but can't escape ". */ -/* Hyphens and underscores equivalent in parameter names. */ -static char *next_arg(char *args, char **param, char **val) -{ - unsigned int i, equals = 0; - int in_quote = 0, quoted = 0; - char *next; - - if (*args == '"') { - args++; - in_quote = 1; - quoted = 1; - } - - for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) - break; - if (equals == 0) { - if (args[i] == '=') - equals = i; - } - if (args[i] == '"') - in_quote = !in_quote; - } - - *param = args; - if (!equals) - *val = NULL; - else { - args[equals] = '\0'; - *val = args + equals + 1; - - /* Don't include quotes in value. */ - if (**val == '"') { - (*val)++; - if (args[i-1] == '"') - args[i-1] = '\0'; - } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; - } - - if (args[i]) { - args[i] = '\0'; - next = args + i + 1; - } else - next = args + i; - - /* Chew up trailing spaces. */ - while (*next == ' ') - next++; - return next; -} - -/* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/ -int parse_args(const char *name, - char *args, - struct kernel_param *params, - unsigned num, - int (*unknown)(char *param, char *val)) -{ - char *param, *val; - - DEBUGP("Parsing ARGS: %s\n", args); - - /* Chew leading spaces */ - while (*args == ' ') - args++; - - while (*args) { - int ret; - int irq_was_disabled; - - args = next_arg(args, ¶m, &val); - irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, unknown); - if (irq_was_disabled && !irqs_disabled()) { - printk(KERN_WARNING "parse_args(): option '%s' enabled " - "irq's!\n", param); - } - switch (ret) { - case -ENOENT: - printk(KERN_ERR "%s: Unknown parameter `%s'\n", - name, param); - return ret; - case -ENOSPC: - printk(KERN_ERR - "%s: `%s' too large for parameter `%s'\n", - name, val ?: "", param); - return ret; - case 0: - break; - default: - printk(KERN_ERR - "%s: `%s' invalid for parameter `%s'\n", - name, val ?: "", param); - return ret; - } - } - - /* All parsed OK. */ - return 0; -} - -/* Lazy bastard, eh? 
*/ -#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ - int param_set_##name(const char *val, struct kernel_param *kp) \ - { \ - tmptype l; \ - int ret; \ - \ - if (!val) return -EINVAL; \ - ret = strtolfn(val, 0, &l); \ - if (ret == -EINVAL || ((type)l != l)) \ - return -EINVAL; \ - *((type *)kp->arg) = l; \ - return 0; \ - } \ - int param_get_##name(char *buffer, struct kernel_param *kp) \ - { \ - return sprintf(buffer, format, *((type *)kp->arg)); \ - } - -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); - -int param_set_charp(const char *val, struct kernel_param *kp) -{ - if (!val) { - printk(KERN_ERR "%s: string parameter expected\n", - kp->name); - return -EINVAL; - } - - if (strlen(val) > 1024) { - printk(KERN_ERR "%s: string parameter too long\n", - kp->name); - return -ENOSPC; - } - - *(char **)kp->arg = (char *)val; - return 0; -} - -int param_get_charp(char *buffer, struct kernel_param *kp) -{ - return sprintf(buffer, "%s", *((char **)kp->arg)); -} - -int param_set_bool(const char *val, struct kernel_param *kp) -{ - /* No equals means "set"... */ - if (!val) val = "1"; - - /* One of =[yYnN01] */ - switch (val[0]) { - case 'y': case 'Y': case '1': - *(int *)kp->arg = 1; - return 0; - case 'n': case 'N': case '0': - *(int *)kp->arg = 0; - return 0; - } - return -EINVAL; -} - -int param_get_bool(char *buffer, struct kernel_param *kp) -{ - /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 
'Y' : 'N'); -} - -int param_set_invbool(const char *val, struct kernel_param *kp) -{ - int boolval, ret; - struct kernel_param dummy; - - dummy.arg = &boolval; - ret = param_set_bool(val, &dummy); - if (ret == 0) - *(int *)kp->arg = !boolval; - return ret; -} - -int param_get_invbool(char *buffer, struct kernel_param *kp) -{ - return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); -} - -/* We break the rule and mangle the string. */ -static int param_array(const char *name, - const char *val, - unsigned int min, unsigned int max, - void *elem, int elemsize, - int (*set)(const char *, struct kernel_param *kp), - unsigned int *num) -{ - int ret; - struct kernel_param kp; - char save; - - /* Get the name right for errors. */ - kp.name = name; - kp.arg = elem; - - /* No equals sign? */ - if (!val) { - printk(KERN_ERR "%s: expects arguments\n", name); - return -EINVAL; - } - - *num = 0; - /* We expect a comma-separated list of values. */ - do { - int len; - - if (*num == max) { - printk(KERN_ERR "%s: can only take %i arguments\n", - name, max); - return -EINVAL; - } - len = strcspn(val, ","); - - /* nul-terminate and parse */ - save = val[len]; - ((char *)val)[len] = '\0'; - ret = set(val, &kp); - - if (ret != 0) - return ret; - kp.arg += elemsize; - val += len+1; - (*num)++; - } while (save == ','); - - if (*num < min) { - printk(KERN_ERR "%s: needs at least %i arguments\n", - name, min); - return -EINVAL; - } - return 0; -} - -int param_array_set(const char *val, struct kernel_param *kp) -{ - const struct kparam_array *arr = kp->arr; - unsigned int temp_num; - - return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); -} - -int param_array_get(char *buffer, struct kernel_param *kp) -{ - int i, off, ret; - const struct kparam_array *arr = kp->arr; - struct kernel_param p; - - p = *kp; - for (i = off = 0; i < (arr->num ? 
*arr->num : arr->max); i++) { - if (i) - buffer[off++] = ','; - p.arg = arr->elem + arr->elemsize * i; - ret = arr->get(buffer + off, &p); - if (ret < 0) - return ret; - off += ret; - } - buffer[off] = '\0'; - return off; -} - -int param_set_copystring(const char *val, struct kernel_param *kp) -{ - const struct kparam_string *kps = kp->str; - - if (!val) { - printk(KERN_ERR "%s: missing param set value\n", kp->name); - return -EINVAL; - } - if (strlen(val)+1 > kps->maxlen) { - printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", - kp->name, kps->maxlen-1); - return -ENOSPC; - } - strcpy(kps->string, val); - return 0; -} - -int param_get_string(char *buffer, struct kernel_param *kp) -{ - const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); -} - -/* sysfs output in /sys/modules/XYZ/parameters/ */ - -extern struct kernel_param __start___param[], __stop___param[]; - -struct param_attribute -{ - struct module_attribute mattr; - struct kernel_param *param; -}; - -struct module_param_attrs -{ - struct attribute_group grp; - struct param_attribute attrs[0]; -}; - -#ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr); - -static ssize_t param_attr_show(struct module_attribute *mattr, - struct module *mod, char *buf) -{ - int count; - struct param_attribute *attribute = to_param_attr(mattr); - - if (!attribute->param->get) - return -EPERM; - - count = attribute->param->get(buf, attribute->param); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } - return count; -} - -/* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ -static ssize_t param_attr_store(struct module_attribute *mattr, - struct module *owner, - const char *buf, size_t len) -{ - int err; - struct param_attribute *attribute = to_param_attr(mattr); - - if (!attribute->param->set) - return -EPERM; - - err = attribute->param->set(buf, attribute->param); - if (!err) - return len; - return err; -} -#endif - -#ifdef CONFIG_MODULES -#define __modinit -#else -#define __modinit __init -#endif - -#ifdef CONFIG_SYSFS -/* - * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME - * @mk: struct module_kobject (contains parent kobject) - * @kparam: array of struct kernel_param, the actual parameter definitions - * @num_params: number of entries in array - * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules" - * - * Create a kobject for a (per-module) group of parameters, and create files - * in sysfs. A pointer to the param_kobject is returned on success, - * NULL if there's no parameter to export, or other ERR_PTR(err). 
- */ -static __modinit struct module_param_attrs * -param_sysfs_setup(struct module_kobject *mk, - struct kernel_param *kparam, - unsigned int num_params, - unsigned int name_skip) -{ - struct module_param_attrs *mp; - unsigned int valid_attrs = 0; - unsigned int i, size[2]; - struct param_attribute *pattr; - struct attribute **gattr; - int err; - - for (i=0; iattrs[0]), - sizeof(mp->grp.attrs[0])); - size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); - - mp = kzalloc(size[0] + size[1], GFP_KERNEL); - if (!mp) - return ERR_PTR(-ENOMEM); - - mp->grp.name = "parameters"; - mp->grp.attrs = (void *)mp + size[0]; - - pattr = &mp->attrs[0]; - gattr = &mp->grp.attrs[0]; - for (i = 0; i < num_params; i++) { - struct kernel_param *kp = &kparam[i]; - if (kp->perm) { - pattr->param = kp; - pattr->mattr.show = param_attr_show; - pattr->mattr.store = param_attr_store; - pattr->mattr.attr.name = (char *)&kp->name[name_skip]; - pattr->mattr.attr.mode = kp->perm; - *(gattr++) = &(pattr++)->mattr.attr; - } - } - *gattr = NULL; - - if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { - kfree(mp); - return ERR_PTR(err); - } - return mp; -} - -#ifdef CONFIG_MODULES -/* - * module_param_sysfs_setup - setup sysfs support for one module - * @mod: module - * @kparam: module parameters (array) - * @num_params: number of module parameters - * - * Adds sysfs entries for module parameters, and creates a link from - * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ - */ -int module_param_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) -{ - struct module_param_attrs *mp; - - mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); - if (IS_ERR(mp)) - return PTR_ERR(mp); - - mod->param_attrs = mp; - return 0; -} - -/* - * module_param_sysfs_remove - remove sysfs support for one module - * @mod: module - * - * Remove sysfs entries for module parameters and the corresponding - * kobject. 
- */ -void module_param_sysfs_remove(struct module *mod) -{ - if (mod->param_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->param_attrs->grp); - /* We are positive that no one is using any param - * attrs at this point. Deallocate immediately. */ - kfree(mod->param_attrs); - mod->param_attrs = NULL; - } -} -#endif - -/* - * kernel_param_sysfs_setup - wrapper for built-in params support - */ -static void __init kernel_param_sysfs_setup(const char *name, - struct kernel_param *kparam, - unsigned int num_params, - unsigned int name_skip) -{ - struct module_kobject *mk; - int ret; - - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); - if (ret) { - kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed to be added to sysfs, " - "error number %d\n", name, ret); - printk(KERN_ERR "The system will be unstable now.\n"); - return; - } - param_sysfs_setup(mk, kparam, num_params, name_skip); - kobject_uevent(&mk->kobj, KOBJ_ADD); -} - -/* - * param_sysfs_builtin - add contents in /sys/parameters for built-in modules - * - * Add module_parameters to sysfs for "modules" built into the kernel. - * - * The "module" name (KBUILD_MODNAME) is stored before a dot, the - * "parameter" name is stored behind a dot in kernel_param->name. So, - * extract the "module" name for all built-in kernel_param-eters, - * and for all who have the same, call kernel_param_sysfs_setup. 
- */ -static void __init param_sysfs_builtin(void) -{ - struct kernel_param *kp, *kp_begin = NULL; - unsigned int i, name_len, count = 0; - char modname[MODULE_NAME_LEN + 1] = ""; - - for (i=0; i < __stop___param - __start___param; i++) { - char *dot; - size_t max_name_len; - - kp = &__start___param[i]; - max_name_len = - min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); - - dot = memchr(kp->name, '.', max_name_len); - if (!dot) { - DEBUGP("couldn't find period in first %d characters " - "of %s\n", MODULE_NAME_LEN, kp->name); - continue; - } - name_len = dot - kp->name; - - /* new kbuild_modname? */ - if (strlen(modname) != name_len - || strncmp(modname, kp->name, name_len) != 0) { - /* add a new kobject for previous kernel_params. */ - if (count) - kernel_param_sysfs_setup(modname, - kp_begin, - count, - strlen(modname)+1); - - strncpy(modname, kp->name, name_len); - modname[name_len] = '\0'; - count = 0; - kp_begin = kp; - } - count++; - } - - /* last kernel_params need to be registered as well */ - if (count) - kernel_param_sysfs_setup(modname, kp_begin, count, - strlen(modname)+1); -} - - -/* module-related sysfs stuff */ - -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); - -static ssize_t module_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct module_attribute *attribute; - struct module_kobject *mk; - int ret; - - attribute = to_module_attr(attr); - mk = to_module_kobject(kobj); - - if (!attribute->show) - return -EIO; - - ret = attribute->show(attribute, mk->mod, buf); - - return ret; -} - -static ssize_t module_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct module_attribute *attribute; - struct module_kobject *mk; - int ret; - - attribute = to_module_attr(attr); - mk = to_module_kobject(kobj); - - if (!attribute->store) - return -EIO; - - ret = 
attribute->store(attribute, mk->mod, buf, len); - - return ret; -} - -static struct sysfs_ops module_sysfs_ops = { - .show = module_attr_show, - .store = module_attr_store, -}; - -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &module_ktype) - return 1; - return 0; -} - -static struct kset_uevent_ops module_uevent_ops = { - .filter = uevent_filter, -}; - -struct kset *module_kset; -int module_sysfs_initialized; - -struct kobj_type module_ktype = { - .sysfs_ops = &module_sysfs_ops, -}; - -/* - * param_sysfs_init - wrapper for built-in params support - */ -static int __init param_sysfs_init(void) -{ - module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); - if (!module_kset) { - printk(KERN_WARNING "%s (%d): error creating kset\n", - __FILE__, __LINE__); - return -ENOMEM; - } - module_sysfs_initialized = 1; - - param_sysfs_builtin(); - - return 0; -} -subsys_initcall(param_sysfs_init); - -#endif /* CONFIG_SYSFS */ - -EXPORT_SYMBOL(param_set_byte); -EXPORT_SYMBOL(param_get_byte); -EXPORT_SYMBOL(param_set_short); -EXPORT_SYMBOL(param_get_short); -EXPORT_SYMBOL(param_set_ushort); -EXPORT_SYMBOL(param_get_ushort); -EXPORT_SYMBOL(param_set_int); -EXPORT_SYMBOL(param_get_int); -EXPORT_SYMBOL(param_set_uint); -EXPORT_SYMBOL(param_get_uint); -EXPORT_SYMBOL(param_set_long); -EXPORT_SYMBOL(param_get_long); -EXPORT_SYMBOL(param_set_ulong); -EXPORT_SYMBOL(param_get_ulong); -EXPORT_SYMBOL(param_set_charp); -EXPORT_SYMBOL(param_get_charp); -EXPORT_SYMBOL(param_set_bool); -EXPORT_SYMBOL(param_get_bool); -EXPORT_SYMBOL(param_set_invbool); -EXPORT_SYMBOL(param_get_invbool); -EXPORT_SYMBOL(param_array_set); -EXPORT_SYMBOL(param_array_get); -EXPORT_SYMBOL(param_set_copystring); -EXPORT_SYMBOL(param_get_string); -/* - * Generic pidhash and scalable, time-bounded PID allocator - * - * (C) 2002-2003 William Irwin, IBM - * (C) 2004 William Irwin, Oracle - * (C) 2002-2004 Ingo Molnar, Red Hat 
- * - * pid-structures are backing objects for tasks sharing a given ID to chain - * against. There is very little to them aside from hashing them and - * parking tasks using given ID's on a list. - * - * The hash is always changed with the tasklist_lock write-acquired, - * and the hash is only accessed with the tasklist_lock at least - * read-acquired, so there's no additional SMP locking needed here. - * - * We have a list of bitmap pages, which bitmaps represent the PID space. - * Allocating and freeing PIDs is completely lockless. The worst-case - * allocation scenario when all but one out of 1 million PIDs possible are - * allocated already: the scanning of 32 list entries and at most PAGE_SIZE - * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). - * - * Pid namespaces: - * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. - * (C) 2007 Sukadev Bhattiprolu , IBM - * Many thanks to Oleg Nesterov for comments and help - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static int pidhash_shift; -struct pid init_struct_pid = INIT_STRUCT_PID; - -int pid_max = PID_MAX_DEFAULT; - -#define RESERVED_PIDS 300 - -int pid_max_min = RESERVED_PIDS + 1; -int pid_max_max = PID_MAX_LIMIT; - -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) - -/* - * PID-map pages start out as NULL, they get allocated upon - * first use and are never deallocated. This way a low pid_max - * value does not cause lots of bitmaps to be allocated, but - * the scheme scales to up to 4 million PIDs, runtime. 
- */ -struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .pidmap = { - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, - .level = 0, - .child_reaper = &init_task, -}; -EXPORT_SYMBOL_GPL(init_pid_ns); - -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. 
- */ - -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); - -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (map->page) - kfree(page); - else - map->page = page; - spin_unlock_irq(&pidmap_lock); - if (unlikely(!map->page)) - break; - } - if (likely(atomic_read(&map->nr_free))) { - do { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - pid_ns->last_pid = pid; - return pid; - } - offset = find_next_offset(map, offset); - pid = mk_pid(pid_ns, map, offset); - /* - * find_next_offset() found a bit, the pid from it - * is in-bounds, and if we fell back to the last - * bitmap block and the final block was the same - * as the starting point, pid is before last_pid. 
- */ - } while (offset < BITS_PER_PAGE && pid < pid_max && - (i != max_scan || pid < last || - !((last+1) & BITS_PER_PAGE_MASK))); - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -1; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - -void put_pid(struct pid *pid) -{ - struct pid_namespace *ns; - - if (!pid) - return; - - ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { - kmem_cache_free(ns->pid_cachep, pid); - put_pid_ns(ns); - } -} -EXPORT_SYMBOL_GPL(put_pid); - -static void delayed_put_pid(struct rcu_head *rhp) -{ - struct pid *pid = container_of(rhp, struct pid, rcu); - put_pid(pid); -} - -void free_pid(struct pid *pid) -{ - /* We can be called with write_lock_irq(&tasklist_lock) held */ - int i; - unsigned long flags; - - spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); - spin_unlock_irqrestore(&pidmap_lock, flags); - - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - - call_rcu(&pid->rcu, delayed_put_pid); -} - -struct pid *alloc_pid(struct pid_namespace *ns) -{ - struct pid *pid; - enum pid_type type; - int i, nr; - struct pid_namespace *tmp; - struct upid *upid; - - pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); - if (!pid) - goto out; 
- - tmp = ns; - for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); - if (nr < 0) - goto out_free; - - pid->numbers[i].nr = nr; - pid->numbers[i].ns = tmp; - tmp = tmp->parent; - } - - get_pid_ns(ns); - pid->level = ns->level; - atomic_set(&pid->count, 1); - for (type = 0; type < PIDTYPE_MAX; ++type) - INIT_HLIST_HEAD(&pid->tasks[type]); - - spin_lock_irq(&pidmap_lock); - for (i = ns->level; i >= 0; i--) { - upid = &pid->numbers[i]; - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); - } - spin_unlock_irq(&pidmap_lock); - -out: - return pid; - -out_free: - while (++i <= ns->level) - free_pidmap(pid->numbers + i); - - kmem_cache_free(ns->pid_cachep, pid); - pid = NULL; - goto out; -} - -struct pid *find_pid_ns(int nr, struct pid_namespace *ns) -{ - struct hlist_node *elem; - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, elem, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; -} -EXPORT_SYMBOL_GPL(find_pid_ns); - -struct pid *find_vpid(int nr) -{ - return find_pid_ns(nr, current->nsproxy->pid_ns); -} -EXPORT_SYMBOL_GPL(find_vpid); - -/* - * attach_pid() must be called with the tasklist_lock write-held. 
- */ -void attach_pid(struct task_struct *task, enum pid_type type, - struct pid *pid) -{ - struct pid_link *link; - - link = &task->pids[type]; - link->pid = pid; - hlist_add_head_rcu(&link->node, &pid->tasks[type]); -} - -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) -{ - struct pid_link *link; - struct pid *pid; - int tmp; - - link = &task->pids[type]; - pid = link->pid; - - hlist_del_rcu(&link->node); - link->pid = new; - - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (!hlist_empty(&pid->tasks[tmp])) - return; - - free_pid(pid); -} - -void detach_pid(struct task_struct *task, enum pid_type type) -{ - __change_pid(task, type, NULL); -} - -void change_pid(struct task_struct *task, enum pid_type type, - struct pid *pid) -{ - __change_pid(task, type, pid); - attach_pid(task, type, pid); -} - -/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ -void transfer_pid(struct task_struct *old, struct task_struct *new, - enum pid_type type) -{ - new->pids[type].pid = old->pids[type].pid; - hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); -} - -struct task_struct *pid_task(struct pid *pid, enum pid_type type) -{ - struct task_struct *result = NULL; - if (pid) { - struct hlist_node *first; - first = rcu_dereference(pid->tasks[type].first); - if (first) - result = hlist_entry(first, struct task_struct, pids[(type)].node); - } - return result; -} -EXPORT_SYMBOL(pid_task); - -/* - * Must be called under rcu_read_lock() or with tasklist_lock read-held. 
- */ -struct task_struct *find_task_by_pid_type_ns(int type, int nr, - struct pid_namespace *ns) -{ - return pid_task(find_pid_ns(nr, ns), type); -} - -EXPORT_SYMBOL(find_task_by_pid_type_ns); - -struct task_struct *find_task_by_vpid(pid_t vnr) -{ - return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, - current->nsproxy->pid_ns); -} -EXPORT_SYMBOL(find_task_by_vpid); - -struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) -{ - return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns); -} -EXPORT_SYMBOL(find_task_by_pid_ns); - -struct pid *get_task_pid(struct task_struct *task, enum pid_type type) -{ - struct pid *pid; - rcu_read_lock(); - pid = get_pid(task->pids[type].pid); - rcu_read_unlock(); - return pid; -} - -struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) -{ - struct task_struct *result; - rcu_read_lock(); - result = pid_task(pid, type); - if (result) - get_task_struct(result); - rcu_read_unlock(); - return result; -} - -struct pid *find_get_pid(pid_t nr) -{ - struct pid *pid; - - rcu_read_lock(); - pid = get_pid(find_vpid(nr)); - rcu_read_unlock(); - - return pid; -} -EXPORT_SYMBOL_GPL(find_get_pid); - -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) -{ - struct upid *upid; - pid_t nr = 0; - - if (pid && ns->level <= pid->level) { - upid = &pid->numbers[ns->level]; - if (upid->ns == ns) - nr = upid->nr; - } - return nr; -} - -pid_t pid_vnr(struct pid *pid) -{ - return pid_nr_ns(pid, current->nsproxy->pid_ns); -} -EXPORT_SYMBOL_GPL(pid_vnr); - -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_pid(tsk), ns); -} -EXPORT_SYMBOL(task_pid_nr_ns); - -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_pgrp(tsk), ns); -} -EXPORT_SYMBOL(task_pgrp_nr_ns); - -pid_t 
task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_session(tsk), ns); -} -EXPORT_SYMBOL(task_session_nr_ns); - -/* - * Used by proc to find the first pid that is greater then or equal to nr. - * - * If there is a pid at nr this function is exactly the same as find_pid_ns. - */ -struct pid *find_ge_pid(int nr, struct pid_namespace *ns) -{ - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; -} - -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. - */ -void __init pidhash_init(void) -{ - int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); - pidhash_size = 1 << pidhash_shift; - - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); -} - -void __init pidmap_init(void) -{ - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); - - init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); -} -/* - * Pid namespaces - * - * Authors: - * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. 
- * (C) 2007 Sukadev Bhattiprolu , IBM - * Many thanks to Oleg Nesterov for comments and help - * - */ - -#include -#include -#include -#include -#include - -#define BITS_PER_PAGE (PAGE_SIZE*8) - -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); -static DEFINE_MUTEX(pid_caches_mutex); -static struct kmem_cache *pid_ns_cachep; - -/* - * creates the kmem cache to allocate pids from. - * @nr_ids: the number of numerical ids this pid will have to carry - */ - -static struct kmem_cache *create_pid_cachep(int nr_ids) -{ - struct pid_cache *pcache; - struct kmem_cache *cachep; - - mutex_lock(&pid_caches_mutex); - list_for_each_entry(pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: - mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; -} - -static struct pid_namespace *create_pid_namespace(unsigned int level) -{ - struct pid_namespace *ns; - int i; - - ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); - if (ns == NULL) - goto out; - - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; - - ns->pid_cachep = create_pid_cachep(level + 1); - if (ns->pid_cachep == NULL) - goto out_free_map; - - kref_init(&ns->kref); - ns->level = level; - - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - 
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - - return ns; - -out_free_map: - kfree(ns->pidmap[0].page); -out_free: - kmem_cache_free(pid_ns_cachep, ns); -out: - return ERR_PTR(-ENOMEM); -} - -static void destroy_pid_namespace(struct pid_namespace *ns) -{ - int i; - - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); - kmem_cache_free(pid_ns_cachep, ns); -} - -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) -{ - struct pid_namespace *new_ns; - - BUG_ON(!old_ns); - new_ns = get_pid_ns(old_ns); - if (!(flags & CLONE_NEWPID)) - goto out; - - new_ns = ERR_PTR(-EINVAL); - if (flags & CLONE_THREAD) - goto out_put; - - new_ns = create_pid_namespace(old_ns->level + 1); - if (!IS_ERR(new_ns)) - new_ns->parent = get_pid_ns(old_ns); - -out_put: - put_pid_ns(old_ns); -out: - return new_ns; -} - -void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns, *parent; - - ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; - destroy_pid_namespace(ns); - - if (parent != NULL) - put_pid_ns(parent); -} - -void zap_pid_ns_processes(struct pid_namespace *pid_ns) -{ - int nr; - int rc; - - /* - * The last thread in the cgroup-init thread group is terminating. - * Find remaining pid_ts in the namespace, signal and wait for them - * to exit. - * - * Note: This signals each threads in the namespace - even those that - * belong to the same thread group, To avoid this, we would have - * to walk the entire tasklist looking a processes in this - * namespace, but that could be unnecessarily expensive if the - * pid namespace has just a few processes. Or we need to - * maintain a tasklist for each pid namespace. 
- * - */ - read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); - nr = next_pidmap(pid_ns, nr); - } - read_unlock(&tasklist_lock); - - do { - clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); - } while (rc != -ECHILD); - - acct_exit_ns(pid_ns); - return; -} - -static __init int pid_namespaces_init(void) -{ - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); - return 0; -} - -__initcall(pid_namespaces_init); -/* - * This module exposes the interface to kernel space for specifying - * QoS dependencies. It provides infrastructure for registration of: - * - * Dependents on a QoS value : register requirements - * Watchers of QoS value : get notified when target QoS value changes - * - * This QoS design is best effort based. Dependents register their QoS needs. - * Watchers register to keep track of the current QoS needs of the system. - * - * There are 3 basic classes of QoS parameter: latency, timeout, throughput - * each have defined units: - * latency: usec - * timeout: usec <-- currently not used. - * throughput: kbs (kilo byte / sec) - * - * There are lists of pm_qos_objects each one wrapping requirements, notifiers - * - * User mode requirements on a QOS parameter register themselves to the - * subsystem by opening the device node /dev/... and writing there request to - * the node. As long as the process holds a file handle open to the node the - * client continues to be accounted for. Upon file release the usermode - * requirement is removed and a new qos target is computed. This way when the - * requirement that the application has is cleaned up when closes the file - * pointer or exits the pm_qos_object will get an opportunity to clean up. 
- * - * Mark Gross - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * locking rule: all changes to requirements or notifiers lists - * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock - * held, taken with _irqsave. One lock to rule them all - */ -struct requirement_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - char *name; -}; - -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - -struct pm_qos_object { - struct requirement_list requirements; - struct blocking_notifier_head *notifiers; - struct miscdevice pm_qos_power_miscdev; - char *name; - s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); -}; - -static struct pm_qos_object null_pm_qos; -static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_object cpu_dma_pm_qos = { - .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, - .notifiers = &cpu_dma_lat_notifier, - .name = "cpu_dma_latency", - .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare -}; - -static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_object network_lat_pm_qos = { - .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, - .notifiers = &network_lat_notifier, - .name = "network_latency", - .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare -}; - - -static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_object network_throughput_pm_qos = { - .requirements = - {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)}, - .notifiers = &network_throughput_notifier, - .name = "network_throughput", - .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare -}; - - -static struct 
pm_qos_object *pm_qos_array[] = { - &null_pm_qos, - &cpu_dma_pm_qos, - &network_lat_pm_qos, - &network_throughput_pm_qos -}; - -static DEFINE_SPINLOCK(pm_qos_lock); - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos); -static int pm_qos_power_open(struct inode *inode, struct file *filp); -static int pm_qos_power_release(struct inode *inode, struct file *filp); - -static const struct file_operations pm_qos_power_fops = { - .write = pm_qos_power_write, - .open = pm_qos_power_open, - .release = pm_qos_power_release, -}; - -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) -{ - return max(v1, v2); -} - -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} - - -static void update_target(int target) -{ - s32 extreme_value; - struct requirement_list *node; - unsigned long flags; - int call_notifier = 0; - - spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[target]->default_value; - list_for_each_entry(node, - &pm_qos_array[target]->requirements.list, list) { - extreme_value = pm_qos_array[target]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[target]->target_value, extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - atomic_read(&pm_qos_array[target]->target_value)); - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - - if (call_notifier) - blocking_notifier_call_chain(pm_qos_array[target]->notifiers, - (unsigned long) extreme_value, NULL); -} - -static int register_pm_qos_misc(struct pm_qos_object *qos) -{ - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - return misc_register(&qos->pm_qos_power_miscdev); -} - -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; - - for 
(pm_qos_class = 0; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; - } - return -1; -} - -/** - * pm_qos_requirement - returns current system wide qos expectation - * @pm_qos_class: identification of which qos value is requested - * - * This function returns the current target value in an atomic manner. - */ -int pm_qos_requirement(int pm_qos_class) -{ - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); -} -EXPORT_SYMBOL_GPL(pm_qos_requirement); - -/** - * pm_qos_add_requirement - inserts new qos request into the list - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request - * @value: defines the qos request - * - * This function inserts a new entry in the pm_qos_class list of requested qos - * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters. - */ -int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) -{ - struct requirement_list *dep; - unsigned long flags; - - dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->name = kstrdup(name, GFP_KERNEL); - if (!dep->name) - goto cleanup; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requirements.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); - - return 0; - } - -cleanup: - kfree(dep); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(pm_qos_add_requirement); - -/** - * pm_qos_update_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request - * @value: defines the qos request - * - * Updates an existing qos requirement for the pm_qos_class of parameters along - * with updating the 
target pm_qos_class value. - * - * If the named request isn't in the list then no change is made. - */ -int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) -{ - unsigned long flags; - struct requirement_list *node; - int pending_update = 0; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - if (new_value == PM_QOS_DEFAULT_VALUE) - node->value = - pm_qos_array[pm_qos_class]->default_value; - else - node->value = new_value; - pending_update = 1; - break; - } - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); - - return 0; -} -EXPORT_SYMBOL_GPL(pm_qos_update_requirement); - -/** - * pm_qos_remove_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request - * - * Will remove named qos request from pm_qos_class list of parameters and - * recompute the current target value for the pm_qos_class. - */ -void pm_qos_remove_requirement(int pm_qos_class, char *name) -{ - unsigned long flags; - struct requirement_list *node; - int pending_update = 0; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - kfree(node->name); - list_del(&node->list); - kfree(node); - pending_update = 1; - break; - } - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); -} -EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); - -/** - * pm_qos_add_notifier - sets notification entry for changes to target value - * @pm_qos_class: identifies which qos target changes should be notified. - * @notifier: notifier block managed by caller. - * - * will register the notifier into a notification chain that gets called - * upon changes to the pm_qos_class target value. 
- */ - int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->notifiers, notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_add_notifier); - -/** - * pm_qos_remove_notifier - deletes notification entry from chain. - * @pm_qos_class: identifies which qos target changes are notified. - * @notifier: notifier block to be removed. - * - * will remove the notifier from the notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->notifiers, notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); - -#define PID_NAME_LEN sizeof("process_1234567890") -static char name[PID_NAME_LEN]; - -static int pm_qos_power_open(struct inode *inode, struct file *filp) -{ - int ret; - long pm_qos_class; - - lock_kernel(); - pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= 0) { - filp->private_data = (void *)pm_qos_class; - sprintf(name, "process_%d", current->pid); - ret = pm_qos_add_requirement(pm_qos_class, name, - PM_QOS_DEFAULT_VALUE); - if (ret >= 0) { - unlock_kernel(); - return 0; - } - } - unlock_kernel(); - - return -EPERM; -} - -static int pm_qos_power_release(struct inode *inode, struct file *filp) -{ - int pm_qos_class; - - pm_qos_class = (long)filp->private_data; - sprintf(name, "process_%d", current->pid); - pm_qos_remove_requirement(pm_qos_class, name); - - return 0; -} - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) -{ - s32 value; - int pm_qos_class; - - pm_qos_class = (long)filp->private_data; - if (count != sizeof(s32)) - return -EINVAL; - if (copy_from_user(&value, buf, sizeof(s32))) - return -EFAULT; - sprintf(name, "process_%d", 
current->pid); - pm_qos_update_requirement(pm_qos_class, name, value); - - return sizeof(s32); -} - - -static int __init pm_qos_power_init(void) -{ - int ret = 0; - - ret = register_pm_qos_misc(&cpu_dma_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_lat_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_throughput_pm_qos); - if (ret < 0) - printk(KERN_ERR - "pm_qos_param: network_throughput setup failed\n"); - - return ret; -} - -late_initcall(pm_qos_power_init); -/* - * Implement CPU time clocks for the POSIX clock interface. - */ - -#include -#include -#include -#include -#include - -static int check_clock(const clockid_t which_clock) -{ - int error = 0; - struct task_struct *p; - const pid_t pid = CPUCLOCK_PID(which_clock); - - if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) - return -EINVAL; - - if (pid == 0) - return 0; - - read_lock(&tasklist_lock); - p = find_task_by_vpid(pid); - if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 
- same_thread_group(p, current) : thread_group_leader(p))) { - error = -EINVAL; - } - read_unlock(&tasklist_lock); - - return error; -} - -static inline union cpu_time_count -timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) -{ - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; - } else { - ret.cpu = timespec_to_cputime(tp); - } - return ret; -} - -static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, - struct timespec *tp) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); - else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return cputime_lt(now.cpu, then.cpu); - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu = cputime_add(acc->cpu, val.cpu); - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu = cputime_sub(a.cpu, b.cpu); - } - return a; -} - -/* - * Divide and limit the result to res >= 1 - * - * This is necessary to prevent signal delivery starvation, when the result of - * the division would be rounded down to 0. 
- */ -static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) -{ - cputime_t res = cputime_div(time, div); - - return max_t(cputime_t, res, 1); -} - -/* - * Update expiry time from increment, and increase overrun count, - * given the current clock sample. - */ -static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) -{ - int i; - - if (timer->it.cpu.incr.sched == 0) - return; - - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; - - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; - - if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) - return; - incr = timer->it.cpu.incr.cpu; - delta = cputime_sub(cputime_add(now.cpu, incr), - timer->it.cpu.expires.cpu); - /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ - for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) - incr = cputime_add(incr, incr); - for (; i >= 0; incr = cputime_halve(incr), i--) { - if (cputime_lt(delta, incr)) - continue; - timer->it.cpu.expires.cpu = - cputime_add(timer->it.cpu.expires.cpu, incr); - timer->it_overrun += 1 << i; - delta = cputime_sub(delta, incr); - } - } -} - -static inline cputime_t prof_ticks(struct task_struct *p) -{ - return cputime_add(p->utime, p->stime); -} -static inline cputime_t virt_ticks(struct task_struct *p) -{ - return p->utime; -} -static inline unsigned long long sched_ns(struct task_struct *p) -{ - return task_sched_runtime(p); -} - -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) -{ - int error = check_clock(which_clock); - if (!error) { - tp->tv_sec = 0; - tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - /* - * If sched_clock is using a cycle counter, we - * don't have any idea of its true resolution - * exported, but it is much more than 1s/HZ. - */ - tp->tv_nsec = 1; - } - } - return error; -} - -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) -{ - /* - * You can never reset a CPU clock, but we check for other errors - * in the call before failing with EPERM. - */ - int error = check_clock(which_clock); - if (error == 0) { - error = -EPERM; - } - return error; -} - - -/* - * Sample a per-thread clock for the given task. - */ -static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) -{ - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); - break; - case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); - break; - case CPUCLOCK_SCHED: - cpu->sched = sched_ns(p); - break; - } - return 0; -} - -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. 
- * Must be called with tasklist_lock held for reading, and p->sighand->siglock. - */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, - struct task_struct *p, - union cpu_time_count *cpu) -{ - struct task_struct *t = p; - switch (clock_idx) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); - do { - cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - cpu->cpu = p->signal->utime; - do { - cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; - /* Add in each other live thread. */ - while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; - } - cpu->sched += sched_ns(p); - break; - } - return 0; -} - -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, - cpu); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - - -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) -{ - const pid_t pid = CPUCLOCK_PID(which_clock); - int error = -EINVAL; - union cpu_time_count rtn; - - if (pid == 0) { - /* - * Special case constant value for our own clocks. - * We don't have to do any lookup to find ourselves. - */ - if (CPUCLOCK_PERTHREAD(which_clock)) { - /* - * Sampling just ourselves we can do with no locking. 
- */ - error = cpu_clock_sample(which_clock, - current, &rtn); - } else { - read_lock(&tasklist_lock); - error = cpu_clock_sample_group(which_clock, - current, &rtn); - read_unlock(&tasklist_lock); - } - } else { - /* - * Find the given PID, and validate that the caller - * should be able to see it. - */ - struct task_struct *p; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) { - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(p, current)) { - error = cpu_clock_sample(which_clock, - p, &rtn); - } - } else { - read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->signal) { - error = - cpu_clock_sample_group(which_clock, - p, &rtn); - } - read_unlock(&tasklist_lock); - } - } - rcu_read_unlock(); - } - - if (error) - return error; - sample_to_timespec(which_clock, rtn, tp); - return 0; -} - - -/* - * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. - * This is called from sys_timer_create with the new timer already locked. - */ -int posix_cpu_timer_create(struct k_itimer *new_timer) -{ - int ret = 0; - const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); - struct task_struct *p; - - if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) - return -EINVAL; - - INIT_LIST_HEAD(&new_timer->it.cpu.entry); - new_timer->it.cpu.incr.sched = 0; - new_timer->it.cpu.expires.sched = 0; - - read_lock(&tasklist_lock); - if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { - if (pid == 0) { - p = current; - } else { - p = find_task_by_vpid(pid); - if (p && !same_thread_group(p, current)) - p = NULL; - } - } else { - if (pid == 0) { - p = current->group_leader; - } else { - p = find_task_by_vpid(pid); - if (p && !thread_group_leader(p)) - p = NULL; - } - } - new_timer->it.cpu.task = p; - if (p) { - get_task_struct(p); - } else { - ret = -EINVAL; - } - read_unlock(&tasklist_lock); - - return ret; -} - -/* - * Clean up a CPU-clock timer that is about to be destroyed. 
- * This is called from timer deletion with the timer already locked. - * If we return TIMER_RETRY, it's necessary to release the timer's lock - * and try again. (This happens when the timer is in the middle of firing.) - */ -int posix_cpu_timer_del(struct k_itimer *timer) -{ - struct task_struct *p = timer->it.cpu.task; - int ret = 0; - - if (likely(p != NULL)) { - read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { - /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. - */ - BUG_ON(!list_empty(&timer->it.cpu.entry)); - } else { - spin_lock(&p->sighand->siglock); - if (timer->it.cpu.firing) - ret = TIMER_RETRY; - else - list_del(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); - } - read_unlock(&tasklist_lock); - - if (!ret) - put_task_struct(p); - } - - return ret; -} - -/* - * Clean out CPU timers still ticking when a thread exited. The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. - * This must be called with the siglock held. 
- */ -static void cleanup_timers(struct list_head *head, - cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) -{ - struct cpu_timer_list *timer, *next; - cputime_t ptime = cputime_add(utime, stime); - - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (cputime_lt(timer->expires.cpu, ptime)) { - timer->expires.cpu = cputime_zero; - } else { - timer->expires.cpu = cputime_sub(timer->expires.cpu, - ptime); - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (cputime_lt(timer->expires.cpu, utime)) { - timer->expires.cpu = cputime_zero; - } else { - timer->expires.cpu = cputime_sub(timer->expires.cpu, - utime); - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; - } else { - timer->expires.sched -= sum_exec_runtime; - } - } -} - -/* - * These are both called with the siglock held, when the current thread - * is being reaped. When the final (leader) thread in the group is reaped, - * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. - */ -void posix_cpu_timers_exit(struct task_struct *tsk) -{ - cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); - -} -void posix_cpu_timers_exit_group(struct task_struct *tsk) -{ - cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, tsk->signal->utime), - cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} - - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. 
- */ -static void process_timer_rebalance(struct task_struct *p, - unsigned int clock_idx, - union cpu_time_count expires, - union cpu_time_count val) -{ - cputime_t ticks, left; - unsigned long long ns, nsleft; - struct task_struct *t = p; - unsigned int nthreads = atomic_read(&p->signal->live); - - if (!nthreads) - return; - - switch (clock_idx) { - default: - BUG(); - break; - case CPUCLOCK_PROF: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(prof_ticks(t), left); - if (cputime_eq(t->it_prof_expires, - cputime_zero) || - cputime_gt(t->it_prof_expires, ticks)) { - t->it_prof_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(virt_ticks(t), left); - if (cputime_eq(t->it_virt_expires, - cputime_zero) || - cputime_gt(t->it_virt_expires, ticks)) { - t->it_virt_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - nsleft = expires.sched - val.sched; - do_div(nsleft, nthreads); - nsleft = max_t(unsigned long long, nsleft, 1); - do { - if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; - if (t->it_sched_expires == 0 || - t->it_sched_expires > ns) { - t->it_sched_expires = ns; - } - } - t = next_thread(t); - } while (t != p); - break; - } -} - -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -{ - /* - * That's all for this thread or process. - * We leave our residual in expires to be reported. - */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); -} - -/* - * Insert the timer on the appropriate list before any timers that - * expire later. 
This must be called with the tasklist_lock held - * for reading, and interrupts disabled. - */ -static void arm_timer(struct k_itimer *timer, union cpu_time_count now) -{ - struct task_struct *p = timer->it.cpu.task; - struct list_head *head, *listpos; - struct cpu_timer_list *const nt = &timer->it.cpu; - struct cpu_timer_list *next; - unsigned long i; - - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? - p->cpu_timers : p->signal->cpu_timers); - head += CPUCLOCK_WHICH(timer->it_clock); - - BUG_ON(!irqs_disabled()); - spin_lock(&p->sighand->siglock); - - listpos = head; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) - break; - listpos = &next->entry; - } - } else { - list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) - break; - listpos = &next->entry; - } - } - list_add(&nt->entry, listpos); - - if (listpos == head) { - /* - * We are the new earliest-expiring timer. - * If we are a thread timer, there can always - * be a process timer telling us to stop earlier. - */ - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_PROF: - if (cputime_eq(p->it_prof_expires, - cputime_zero) || - cputime_gt(p->it_prof_expires, - nt->expires.cpu)) - p->it_prof_expires = nt->expires.cpu; - break; - case CPUCLOCK_VIRT: - if (cputime_eq(p->it_virt_expires, - cputime_zero) || - cputime_gt(p->it_virt_expires, - nt->expires.cpu)) - p->it_virt_expires = nt->expires.cpu; - break; - case CPUCLOCK_SCHED: - if (p->it_sched_expires == 0 || - p->it_sched_expires > nt->expires.sched) - p->it_sched_expires = nt->expires.sched; - break; - } - } else { - /* - * For a process timer, we must balance - * all the live threads' expirations. 
- */ - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, - cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) - break; - goto rebalance; - case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, - cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) - break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; - if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) - break; - goto rebalance; - case CPUCLOCK_SCHED: - rebalance: - process_timer_rebalance( - timer->it.cpu.task, - CPUCLOCK_WHICH(timer->it_clock), - timer->it.cpu.expires, now); - break; - } - } - } - - spin_unlock(&p->sighand->siglock); -} - -/* - * The timer is locked, fire it and arrange for its reload. - */ -static void cpu_timer_fire(struct k_itimer *timer) -{ - if (unlikely(timer->sigq == NULL)) { - /* - * This a special case for clock_nanosleep, - * not a normal timer from sys_timer_create. - */ - wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { - /* - * One-shot timer. Clear it as soon as it's fired. - */ - posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; - } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { - /* - * The signal did not get queued because the signal - * was ignored, so we won't get any callback to - * reload the timer. But we need to keep it - * ticking in case the signal is deliverable next time. - */ - posix_cpu_timer_schedule(timer); - } -} - -/* - * Guts of sys_timer_settime for CPU timers. - * This is called with the timer locked and interrupts disabled. - * If we return TIMER_RETRY, it's necessary to release the timer's lock - * and try again. (This happens when the timer is in the middle of firing.) 
- */ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) -{ - struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, val; - int ret; - - if (unlikely(p == NULL)) { - /* - * Timer refers to a dead task's clock. - */ - return -ESRCH; - } - - new_expires = timespec_to_sample(timer->it_clock, &new->it_value); - - read_lock(&tasklist_lock); - /* - * We need the tasklist_lock to protect against reaping that - * clears p->signal. If p has just been reaped, we can no - * longer get any information about it at all. - */ - if (unlikely(p->signal == NULL)) { - read_unlock(&tasklist_lock); - put_task_struct(p); - timer->it.cpu.task = NULL; - return -ESRCH; - } - - /* - * Disarm any old timer after extracting its expiry time. - */ - BUG_ON(!irqs_disabled()); - - ret = 0; - spin_lock(&p->sighand->siglock); - old_expires = timer->it.cpu.expires; - if (unlikely(timer->it.cpu.firing)) { - timer->it.cpu.firing = -1; - ret = TIMER_RETRY; - } else - list_del_init(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); - - /* - * We need to sample the current value to convert the new - * value from to relative and absolute, and to convert the - * old value from absolute to relative. To set a process - * timer, we need a sample to balance the thread expiry - * times (in arm_timer). With an absolute time, we must - * check if it's already passed. In short, we need a sample. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &val); - } else { - cpu_clock_sample_group(timer->it_clock, p, &val); - } - - if (old) { - if (old_expires.sched == 0) { - old->it_value.tv_sec = 0; - old->it_value.tv_nsec = 0; - } else { - /* - * Update the timer in case it has - * overrun already. 
If it has, - * we'll report it as having overrun - * and with the next reloaded timer - * already ticking, though we are - * swallowing that pending - * notification here to install the - * new setting. - */ - bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); - sample_to_timespec(timer->it_clock, - old_expires, - &old->it_value); - } else { - old->it_value.tv_nsec = 1; - old->it_value.tv_sec = 0; - } - } - } - - if (unlikely(ret)) { - /* - * We are colliding with the timer actually firing. - * Punt after filling in the timer's old value, and - * disable this firing since we are already reporting - * it as an overrun (thanks to bump_cpu_timer above). - */ - read_unlock(&tasklist_lock); - goto out; - } - - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); - } - - /* - * Install the new expiry time (or zero). - * For a timer with no notification action, we don't actually - * arm the timer (we'll just fake it for timer_gettime). - */ - timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && - cpu_time_before(timer->it_clock, val, new_expires)) { - arm_timer(timer, val); - } - - read_unlock(&tasklist_lock); - - /* - * Install the new reload setting, and - * set up the signal and overrun bookkeeping. - */ - timer->it.cpu.incr = timespec_to_sample(timer->it_clock, - &new->it_interval); - - /* - * This acts as a modification timestamp for the timer, - * so any automatic reload attempt will punt on seeing - * that we have reset the timer manually. 
- */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timer->it_overrun_last = 0; - timer->it_overrun = -1; - - if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && - !cpu_time_before(timer->it_clock, val, new_expires)) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } - - ret = 0; - out: - if (old) { - sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &old->it_interval); - } - return ret; -} - -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) -{ - union cpu_time_count now; - struct task_struct *p = timer->it.cpu.task; - int clear_dead; - - /* - * Easy part: convert the reload time. - */ - sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &itp->it_interval); - - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - - if (unlikely(p == NULL)) { - /* - * This task already died and the timer will never fire. - * In this case, expires is actually the dead value. - */ - dead: - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); - return; - } - - /* - * Sample the clock to take the difference with the expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); - clear_dead = p->exit_state; - } else { - read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - * Call the timer disarmed, nothing else to do. 
- */ - put_task_struct(p); - timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - goto dead; - } else { - cpu_clock_sample_group(timer->it_clock, p, &now); - clear_dead = (unlikely(p->exit_state) && - thread_group_empty(p)); - } - read_unlock(&tasklist_lock); - } - - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - if (timer->it.cpu.incr.sched == 0 && - cpu_time_before(timer->it_clock, - timer->it.cpu.expires, now)) { - /* - * Do-nothing timer expired and has no reload, - * so it's as if it was never set. - */ - timer->it.cpu.expires.sched = 0; - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - /* - * Account for any expirations and reloads that should - * have happened. - */ - bump_cpu_timer(timer, now); - } - - if (unlikely(clear_dead)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto dead; - } - - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { - sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), - &itp->it_value); - } else { - /* - * The timer should have expired already, but the firing - * hasn't taken place yet. Say it's just about to expire. - */ - itp->it_value.tv_nsec = 1; - itp->it_value.tv_sec = 0; - } -} - -/* - * Check for any per-thread CPU timers that have fired and move them off - * the tsk->cpu_timers[N] list onto the firing list. Here we update the - * tsk->it_*_expires values to reflect the remaining thread CPU timers. 
- */ -static void check_thread_timers(struct task_struct *tsk, - struct list_head *firing) -{ - int maxfire; - struct list_head *timers = tsk->cpu_timers; - struct signal_struct *const sig = tsk->signal; - - maxfire = 20; - tsk->it_prof_expires = cputime_zero; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { - tsk->it_prof_expires = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - ++timers; - maxfire = 20; - tsk->it_virt_expires = cputime_zero; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { - tsk->it_virt_expires = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - ++timers; - maxfire = 20; - tsk->it_sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->it_sched_expires = t->expires.sched; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - /* - * Check for the special case thread timers. - */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { - unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; - unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; - - if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } - if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { - /* - * At the soft limit, send a SIGXCPU every second. 
- */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur - < sig->rlim[RLIMIT_RTTIME].rlim_max) { - sig->rlim[RLIMIT_RTTIME].rlim_cur += - USEC_PER_SEC; - } - printk(KERN_INFO - "RT Watchdog Timeout: %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -} - -/* - * Check for any per-thread CPU timers that have fired and move them - * off the tsk->*_timers list onto the firing list. Per-thread timers - * have already been taken off. - */ -static void check_process_timers(struct task_struct *tsk, - struct list_head *firing) -{ - int maxfire; - struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; - struct task_struct *t; - struct list_head *timers = sig->cpu_timers; - - /* - * Don't sample the current process CPU clocks if there are no timers. - */ - if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && - sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && - list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) - return; - - /* - * Collect the current process totals. 
- */ - utime = sig->utime; - stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; - t = next_thread(t); - } while (t != tsk); - ptime = cputime_add(utime, stime); - - maxfire = 20; - prof_expires = cputime_zero; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { - prof_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - virt_expires = cputime_zero; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { - virt_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - /* - * Check for the special case process timers. - */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. 
*/ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - unsigned long psecs = cputime_to_secs(ptime); - cputime_t x; - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { - /* - * At the soft limit, send a SIGXCPU every second. 
- */ - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (sig->rlim[RLIMIT_CPU].rlim_cur - < sig->rlim[RLIMIT_CPU].rlim_max) { - sig->rlim[RLIMIT_CPU].rlim_cur++; - } - } - x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - if (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(x, prof_expires)) { - prof_expires = x; - } - } - - if (!cputime_eq(prof_expires, cputime_zero) || - !cputime_eq(virt_expires, cputime_zero) || - sched_expires != 0) { - /* - * Rebalance the threads' expiry times for the remaining - * process CPU timers. - */ - - cputime_t prof_left, virt_left, ticks; - unsigned long long sched_left, sched; - const unsigned int nthreads = atomic_read(&sig->live); - - if (!nthreads) - return; - - prof_left = cputime_sub(prof_expires, utime); - prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div_non_zero(prof_left, nthreads); - virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div_non_zero(virt_left, nthreads); - if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; - do_div(sched_left, nthreads); - sched_left = max_t(unsigned long long, sched_left, 1); - } else { - sched_left = 0; - } - t = tsk; - do { - if (unlikely(t->flags & PF_EXITING)) - continue; - - ticks = cputime_add(cputime_add(t->utime, t->stime), - prof_left); - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(t->it_prof_expires, cputime_zero) || - cputime_gt(t->it_prof_expires, ticks))) { - t->it_prof_expires = ticks; - } - - ticks = cputime_add(t->utime, virt_left); - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(t->it_virt_expires, cputime_zero) || - cputime_gt(t->it_virt_expires, ticks))) { - t->it_virt_expires = ticks; - } - - sched = t->se.sum_exec_runtime + sched_left; - if (sched_expires && (t->it_sched_expires == 0 || - t->it_sched_expires > sched)) { - t->it_sched_expires = sched; - } - } while ((t = next_thread(t)) != tsk); - } -} - -/* - * This is called from the signal code (via 
do_schedule_next_timer) - * when the last timer signal was delivered and we have to reload the timer. - */ -void posix_cpu_timer_schedule(struct k_itimer *timer) -{ - struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; - - if (unlikely(p == NULL)) - /* - * The task was cleaned up already, no future firings. - */ - goto out; - - /* - * Fetch the current sample and update the timer's expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); - bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) { - clear_dead_task(timer, now); - goto out; - } - read_lock(&tasklist_lock); /* arm_timer needs it. */ - } else { - read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - */ - put_task_struct(p); - timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; - goto out_unlock; - } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto out_unlock; - } - cpu_clock_sample_group(timer->it_clock, p, &now); - bump_cpu_timer(timer, now); - /* Leave the tasklist_lock locked for the call below. */ - } - - /* - * Now re-arm for the new expiry time. - */ - arm_timer(timer, now); - -out_unlock: - read_unlock(&tasklist_lock); - -out: - timer->it_overrun_last = timer->it_overrun; - timer->it_overrun = -1; - ++timer->it_requeue_pending; -} - -/* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. 
- */ -void run_posix_cpu_timers(struct task_struct *tsk) -{ - LIST_HEAD(firing); - struct k_itimer *timer, *next; - - BUG_ON(!irqs_disabled()); - -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) - return; - -#undef UNEXPIRED - - /* - * Double-check with locks held. - */ - read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); - - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); - - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - - /* - * Now that all the timers on our list have the firing flag, - * noone will touch their list entries but us. We'll take - * each timer's lock before clearing its firing flag, so no - * timer call will interfere. - */ - list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int firing; - spin_lock(&timer->it_lock); - list_del_init(&timer->it.cpu.entry); - firing = timer->it.cpu.firing; - timer->it.cpu.firing = 0; - /* - * The firing flag is -1 if we collided with a reset - * of the timer, which already reported this - * almost-firing as an overrun. So don't generate an event. 
- */ - if (likely(firing >= 0)) { - cpu_timer_fire(timer); - } - spin_unlock(&timer->it_lock); - } -} - -/* - * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. - */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, - cputime_t *newval, cputime_t *oldval) -{ - union cpu_time_count now; - struct list_head *head; - - BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group_locked(clock_idx, tsk, &now); - - if (oldval) { - if (!cputime_eq(*oldval, cputime_zero)) { - if (cputime_le(*oldval, now.cpu)) { - /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); - } else { - *oldval = cputime_sub(*oldval, now.cpu); - } - } - - if (cputime_eq(*newval, cputime_zero)) - return; - *newval = cputime_add(*newval, now.cpu); - - /* - * If the RLIMIT_CPU timer will expire before the - * ITIMER_PROF timer, we have nothing else to do. - */ - if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur - < cputime_to_secs(*newval)) - return; - } - - /* - * Check whether there are any process timers already set to fire - * before this one. If so, we don't have anything more to do. - */ - head = &tsk->signal->cpu_timers[clock_idx]; - if (list_empty(head) || - cputime_ge(list_first_entry(head, - struct cpu_timer_list, entry)->expires.cpu, - *newval)) { - /* - * Rejigger each thread's expiry time so that one will - * notice before we hit the process-cumulative expiry time. 
- */ - union cpu_time_count expires = { .sched = 0 }; - expires.cpu = *newval; - process_timer_rebalance(tsk, clock_idx, expires, now); - } -} - -static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) -{ - struct k_itimer timer; - int error; - - /* - * Set up a temporary timer and then wait for it to go off. - */ - memset(&timer, 0, sizeof timer); - spin_lock_init(&timer.it_lock); - timer.it_clock = which_clock; - timer.it_overrun = -1; - error = posix_cpu_timer_create(&timer); - timer.it_process = current; - if (!error) { - static struct itimerspec zero_it; - - memset(it, 0, sizeof *it); - it->it_value = *rqtp; - - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); - if (error) { - spin_unlock_irq(&timer.it_lock); - return error; - } - - while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { - /* - * Our timer fired and was reset. - */ - spin_unlock_irq(&timer.it_lock); - return 0; - } - - /* - * Block until cpu_timer_fire (or a signal) wakes us. - */ - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&timer.it_lock); - schedule(); - spin_lock_irq(&timer.it_lock); - } - - /* - * We were interrupted by a signal. - */ - sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, it); - spin_unlock_irq(&timer.it_lock); - - if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { - /* - * It actually did fire already. - */ - return 0; - } - - error = -ERESTART_RESTARTBLOCK; - } - - return error; -} - -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; - struct itimerspec it; - int error; - - /* - * Diagnose required errors first. 
- */ - if (CPUCLOCK_PERTHREAD(which_clock) && - (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) - return -EINVAL; - - error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - - if (flags & TIMER_ABSTIME) - return -ERESTARTNOHAND; - /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) - return -EFAULT; - - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; - } - return error; -} - -long posix_cpu_nsleep_restart(struct restart_block *restart_block) -{ - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; - struct timespec t; - struct itimerspec it; - int error; - - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; - - restart_block->fn = do_no_restart_syscall; - error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - /* - * Report back to the user the time still remaining. 
- */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) - return -EFAULT; - - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; - } - return error; - -} - - -#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) - -static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_getres(PROCESS_CLOCK, tp); -} -static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_get(PROCESS_CLOCK, tp); -} -static int process_cpu_timer_create(struct k_itimer *timer) -{ - timer->it_clock = PROCESS_CLOCK; - return posix_cpu_timer_create(timer); -} -static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, - struct timespec __user *rmtp) -{ - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); -} -static long process_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} -static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_getres(THREAD_CLOCK, tp); -} -static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_get(THREAD_CLOCK, tp); -} -static int thread_cpu_timer_create(struct k_itimer *timer) -{ - timer->it_clock = THREAD_CLOCK; - return posix_cpu_timer_create(timer); -} -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} - -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = 
process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, - }; - - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - - return 0; -} -__initcall(init_posix_cpu_timers); -/* - * linux/kernel/posix-timers.c - * - * - * 2002-10-15 Posix Clocks & timers - * by George Anzinger george@mvista.com - * - * Copyright (C) 2002 2003 by MontaVista Software. - * - * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. - * Copyright (C) 2004 Boris Hu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA - */ - -/* These are all the functions necessary to implement - * POSIX clocks & timers - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to - * void idr_remove(struct idr *idp, int id); to release - * void idr_init(struct idr *idp); to initialize - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. - */ - -/* - * Lets keep our timers in a slab cache :-) - */ -static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); - -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ -#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) -#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" -#endif - - -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. 
- */ - -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. - * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_process - * fields are not modified by timer code. - * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. 
Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). - */ - -static struct k_clock posix_clocks[MAX_CLOCKS]; - -/* - * These ones are defined below. - */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, - struct timespec __user *rmtp); -static void common_timer_get(struct k_itimer *, struct itimerspec *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); -static int common_timer_del(struct k_itimer *timer); - -static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); - -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); - -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) -{ - spin_unlock_irqrestore(&timr->it_lock, flags); -} - -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. 
- */ - -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_real_ts(tp); - return 0; -} - -static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) -{ - return do_sys_settimeofday(tp, NULL); -} - -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) -{ - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - if (posix_clocks[which_clock].res != 0) - return 0; - return 1; -} - -/* - * Get monotonic time for posix timers - */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_ts(tp); - return 0; -} - -/* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ -static __init int init_posix_timers(void) -{ - struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, - }; - struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, - }; - - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); - idr_init(&posix_timers_id); - return 0; -} - -__initcall(init_posix_timers); - -static void schedule_next_timer(struct k_itimer *timr) -{ - struct hrtimer *timer = &timr->it.real.timer; - - if 
(timr->it.real.interval.tv64 == 0) - return; - - timr->it_overrun += (unsigned int) hrtimer_forward(timer, - timer->base->get_time(), - timr->it.real.interval); - - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; - ++timr->it_requeue_pending; - hrtimer_restart(timer); -} - -/* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect aginst the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. - */ -void do_schedule_next_timer(struct siginfo *info) -{ - struct k_itimer *timr; - unsigned long flags; - - timr = lock_timer(info->si_tid, &flags); - - if (timr && timr->it_requeue_pending == info->si_sys_private) { - if (timr->it_clock < 0) - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); - - info->si_overrun += timr->it_overrun_last; - } - - if (timr) - unlock_timer(timr, flags); -} - -int posix_timer_event(struct k_itimer *timr, int si_private) -{ - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->do_schedule_next_timer(). - * - * If dequeue_signal() sees the "right" value of - * si_sys_private it calls do_schedule_next_timer(). - * We re-queue ->sigq and drop ->it_lock(). - * do_schedule_next_timer() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. 
- */ - timr->sigq->info.si_sys_private = si_private; - - timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_code = SI_TIMER; - timr->sigq->info.si_tid = timr->it_id; - timr->sigq->info.si_value = timr->it_sigev_value; - - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - - if (likely(ret >= 0)) - return ret; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; - put_task_struct(timr->it_process); - timr->it_process = leader; - } - - return send_sigqueue(timr->sigq, timr->it_process, 1); -} -EXPORT_SYMBOL_GPL(posix_timer_event); - -/* - * This function gets called when a POSIX.1b interval timer expires. It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. - */ -static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) -{ - struct k_itimer *timr; - unsigned long flags; - int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; - - timr = container_of(timer, struct k_itimer, it.real.timer); - spin_lock_irqsave(&timr->it_lock, flags); - - if (timr->it.real.interval.tv64 != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - if (timr->it.real.interval.tv64 != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. 
- * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. - */ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); - - if (timr->it.real.interval.tv64 < kj.tv64) - now = ktime_add(now, kj); - } -#endif - timr->it_overrun += (unsigned int) - hrtimer_forward(timer, now, - timr->it.real.interval); - ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; - } - } - - unlock_timer(timr, flags); - return ret; -} - -static struct task_struct * good_sigevent(sigevent_t * event) -{ - struct task_struct *rtn = current->group_leader; - - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) - return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return rtn; -} - -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(register_posix_clock); - -static struct k_itimer * alloc_posix_timer(void) -{ - struct k_itimer *tmr; - tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); - if (!tmr) - return tmr; - if (unlikely(!(tmr->sigq = 
sigqueue_alloc()))) { - kmem_cache_free(posix_timers_cache, tmr); - return NULL; - } - memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); - return tmr; -} - -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) -{ - if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); - } - sigqueue_free(tmr->sigq); - kmem_cache_free(posix_timers_cache, tmr); -} - -/* Create a POSIX.1b interval timer. */ - -SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, - struct sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) -{ - int error = 0; - struct k_itimer *new_timer = NULL; - int new_timer_id; - struct task_struct *process = NULL; - unsigned long flags; - sigevent_t event; - int it_id_set = IT_ID_NOT_SET; - - if (invalid_clockid(which_clock)) - return -EINVAL; - - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - retry: - if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { - error = -EAGAIN; - goto out; - } - spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, (void *) new_timer, - &new_timer_id); - spin_unlock_irq(&idr_lock); - if (error == -EAGAIN) - goto retry; - else if (error) { - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - error = -EAGAIN; - goto out; - } - - it_id_set = IT_ID_SET; - new_timer->it_id = (timer_t) new_timer_id; - new_timer->it_clock = which_clock; - new_timer->it_overrun = -1; - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); - if (error) - goto out; - - /* - * return the timer_id now. The next step is hard to - * back out if there is an error. 
- */ - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } - if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { - error = -EFAULT; - goto out; - } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->it_sigev_signo = event.sigev_signo; - new_timer->it_sigev_value = event.sigev_value; - - read_lock(&tasklist_lock); - if ((process = good_sigevent(&event))) { - /* - * We may be setting up this process for another - * thread. It may be exiting. To catch this - * case the we check the PF_EXITING flag. If - * the flag is not set, the siglock will catch - * him before it is too late (in exit_itimers). - * - * The exec case is a bit more invloved but easy - * to code. If the process is in our thread - * group (and it must be or we would not allow - * it here) and is doing an exec, it will cause - * us to be killed. In this case it will wait - * for us to die which means we can finish this - * linkage with our last gasp. I.e. 
no code :) - */ - spin_lock_irqsave(&process->sighand->siglock, flags); - if (!(process->flags & PF_EXITING)) { - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - get_task_struct(process); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } else { - spin_unlock_irqrestore(&process->sighand->siglock, flags); - process = NULL; - } - } - read_unlock(&tasklist_lock); - if (!process) { - error = -EINVAL; - goto out; - } - } else { - new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->it_sigev_signo = SIGALRM; - new_timer->it_sigev_value.sival_int = new_timer->it_id; - process = current->group_leader; - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } - - /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. - */ - -out: - if (error) - release_posix_timer(new_timer, it_id_set); - - return error; -} - -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) -{ - struct k_itimer *timr; - /* - * Watch out here. We do a irqsave on the idr_lock and pass the - * flags part over to the timer lock. Must not let interrupts in - * while we are moving the lock. 
- */ - - spin_lock_irqsave(&idr_lock, *flags); - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); - if (timr) { - spin_lock(&timr->it_lock); - - if ((timr->it_id != timer_id) || !(timr->it_process) || - !same_thread_group(timr->it_process, current)) { - spin_unlock(&timr->it_lock); - spin_unlock_irqrestore(&idr_lock, *flags); - timr = NULL; - } else - spin_unlock(&idr_lock); - } else - spin_unlock_irqrestore(&idr_lock, *flags); - - return timr; -} - -/* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. - * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending. These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. - * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here. Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. - */ -static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) -{ - ktime_t now, remaining, iv; - struct hrtimer *timer = &timr->it.real.timer; - - memset(cur_setting, 0, sizeof(struct itimerspec)); - - iv = timr->it.real.interval; - - /* interval timer ? */ - if (iv.tv64) - cur_setting->it_interval = ktime_to_timespec(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - return; - - now = timer->base->get_time(); - - /* - * When a requeue is pending or this is a SIGEV_NONE - * timer move the expiry time forward by intervals, so - * expiry is > now. 
- */ - if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - - remaining = ktime_sub(timer->expires, now); - /* Return 0 only, when the timer is expired and not pending */ - if (remaining.tv64 <= 0) { - /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! - */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - cur_setting->it_value.tv_nsec = 1; - } else - cur_setting->it_value = ktime_to_timespec(remaining); -} - -/* Get the time remaining on a POSIX.1b interval timer. */ -SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) -{ - struct k_itimer *timr; - struct itimerspec cur_setting; - unsigned long flags; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); - - unlock_timer(timr, flags); - - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) - return -EFAULT; - - return 0; -} - -/* - * Get the number of overruns of a POSIX.1b interval timer. This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to do_schedule_next_timer(). So all we need to do is - * to pick up the frozen overrun. - */ -SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) -{ - struct k_itimer *timr; - int overrun; - unsigned long flags; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timr->it_overrun_last; - unlock_timer(timr, flags); - - return overrun; -} - -/* Set a POSIX.1b interval timer. */ -/* timr->it_lock is taken. 
*/ -static int -common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) -{ - struct hrtimer *timer = &timr->it.real.timer; - enum hrtimer_mode mode; - - if (old_setting) - common_timer_get(timr, old_setting); - - /* disable the timer */ - timr->it.real.interval.tv64 = 0; - /* - * careful here. If smp we could be in the "fire" routine which will - * be spinning as we hold the lock. But this is ONLY an SMP issue. - */ - if (hrtimer_try_to_cancel(timer) < 0) - return TIMER_RETRY; - - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; - - /* switch off the timer when it_value is zero */ - if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) - return 0; - - mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; - - timer->expires = timespec_to_ktime(new_setting->it_value); - - /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); - - /* SIGEV_NONE timers are not queued ! See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { - /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { - timer->expires = - ktime_add_safe(timer->expires, - timer->base->get_time()); - } - return 0; - } - - hrtimer_start(timer, timer->expires, mode); - return 0; -} - -/* Set a POSIX.1b interval timer */ -SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) -{ - struct k_itimer *timr; - struct itimerspec new_spec, old_spec; - int error = 0; - unsigned long flag; - struct itimerspec *rtn = old_setting ? 
&old_spec : NULL; - - if (!new_setting) - return -EINVAL; - - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) - return -EFAULT; - - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) - return -EINVAL; -retry: - timr = lock_timer(timer_id, &flag); - if (!timr) - return -EINVAL; - - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); - - unlock_timer(timr, flag); - if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... - goto retry; - } - - if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) - error = -EFAULT; - - return error; -} - -static inline int common_timer_del(struct k_itimer *timer) -{ - timer->it.real.interval.tv64 = 0; - - if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) - return TIMER_RETRY; - return 0; -} - -static inline int timer_delete_hook(struct k_itimer *timer) -{ - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); -} - -/* Delete a POSIX.1b interval timer. */ -SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) -{ - struct k_itimer *timer; - unsigned long flags; - -retry_delete: - timer = lock_timer(timer_id, &flags); - if (!timer) - return -EINVAL; - - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); - goto retry_delete; - } - - spin_lock(¤t->sighand->siglock); - list_del(&timer->list); - spin_unlock(¤t->sighand->siglock); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). 
- */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); - timer->it_process = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); - return 0; -} - -/* - * return timer owned by the process, used by exit_itimers - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - -retry_delete: - spin_lock_irqsave(&timer->it_lock, flags); - - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); - goto retry_delete; - } - list_del(&timer->list); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); - timer->it_process = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); -} - -/* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. - */ -void exit_itimers(struct signal_struct *sig) -{ - struct k_itimer *tmr; - - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); - itimer_delete(tmr); - } -} - -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. 
*/ - return -ENOTSUP; -#endif -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - -SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) -{ - struct timespec new_tp; - - if (invalid_clockid(which_clock)) - return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) - return -EFAULT; - - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); -} - -SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) -{ - struct timespec kernel_tp; - int error; - - if (invalid_clockid(which_clock)) - return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) - error = -EFAULT; - - return error; - -} - -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) -{ - struct timespec rtn_tp; - int error; - - if (invalid_clockid(which_clock)) - return -EINVAL; - - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); - - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { - error = -EFAULT; - } - - return error; -} - -/* - * nanosleep for monotonic and realtime clocks - */ -static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 
- HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); -} - -SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) -{ - struct timespec t; - - if (invalid_clockid(which_clock)) - return -EINVAL; - - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) - return -EFAULT; - - if (!timespec_valid(&t)) - return -EINVAL; - - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); -} - -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); -} - -/* - * This will restart clock_nanosleep. This is required only by - * compat_clock_nanosleep_restart for now. - */ -long -clock_nanosleep_restart(struct restart_block *restart_block) -{ - clockid_t which_clock = restart_block->arg0; - - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); -} -/* - * drivers/power/process.c - Functions for saving/restoring console. - * - * Originally from swsusp. - */ - -#include -#include -#include -#include -#include "power.h" - -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) -#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) - -static int orig_fgconsole, orig_kmsg; -static int disable_vt_switch; - -/* - * Normally during a suspend, we allocate a new console and switch to it. - * When we resume, we switch back to the original console. This switch - * can be slow, so on systems where the framebuffer can handle restoration - * of video registers anyways, there's little point in doing the console - * switch. This function allows you to disable it by passing it '0'. 
- */ -void pm_set_vt_switch(int do_switch) -{ - acquire_console_sem(); - disable_vt_switch = !do_switch; - release_console_sem(); -} -EXPORT_SYMBOL(pm_set_vt_switch); - -int pm_prepare_console(void) -{ - acquire_console_sem(); - - if (disable_vt_switch) { - release_console_sem(); - return 0; - } - - orig_fgconsole = fg_console; - - if (vc_allocate(SUSPEND_CONSOLE)) { - /* we can't have a free VC for now. Too bad, - * we don't want to mess the screen for now. */ - release_console_sem(); - return 1; - } - - if (set_console(SUSPEND_CONSOLE)) { - /* - * We're unable to switch to the SUSPEND_CONSOLE. - * Let the calling function know so it can decide - * what to do. - */ - release_console_sem(); - return 1; - } - release_console_sem(); - - if (vt_waitactive(SUSPEND_CONSOLE)) { - pr_debug("Suspend: Can't switch VCs."); - return 1; - } - orig_kmsg = kmsg_redirect; - kmsg_redirect = SUSPEND_CONSOLE; - return 0; -} - -void pm_restore_console(void) -{ - acquire_console_sem(); - if (disable_vt_switch) { - release_console_sem(); - return; - } - set_console(orig_fgconsole); - release_console_sem(); - kmsg_redirect = orig_kmsg; -} -#endif -/* - * kernel/power/disk.c - Suspend-to-disk support. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek - * - * This file is released under the GPLv2. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - - -static int noresume = 0; -static char resume_file[256] = CONFIG_PM_STD_PARTITION; -dev_t swsusp_resume_device; -sector_t swsusp_resume_block; - -enum { - HIBERNATION_INVALID, - HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, - HIBERNATION_SHUTDOWN, - HIBERNATION_REBOOT, - /* keep last */ - __HIBERNATION_AFTER_LAST -}; -#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) -#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) - -static int hibernation_mode = HIBERNATION_SHUTDOWN; - -static struct platform_hibernation_ops *hibernation_ops; - -/** - * hibernation_set_ops - set the global hibernate operations - * @ops: the hibernation operations to use in subsequent hibernation transitions - */ - -void hibernation_set_ops(struct platform_hibernation_ops *ops) -{ - if (ops && !(ops->begin && ops->end && ops->pre_snapshot - && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup)) { - WARN_ON(1); - return; - } - mutex_lock(&pm_mutex); - hibernation_ops = ops; - if (ops) - hibernation_mode = HIBERNATION_PLATFORM; - else if (hibernation_mode == HIBERNATION_PLATFORM) - hibernation_mode = HIBERNATION_SHUTDOWN; - - mutex_unlock(&pm_mutex); -} - -#ifdef CONFIG_PM_DEBUG -static void hibernation_debug_sleep(void) -{ - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); - mdelay(5000); -} - -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - -static int hibernation_test(int level) -{ - if (pm_test_level == level) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} -#else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } -static int hibernation_test(int level) { return 0; } -#endif /* !CONFIG_PM_DEBUG */ - -/** - * 
platform_begin - tell the platform driver that we're starting - * hibernation - */ - -static int platform_begin(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; -} - -/** - * platform_end - tell the platform driver that we've entered the - * working state - */ - -static void platform_end(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->end(); -} - -/** - * platform_pre_snapshot - prepare the machine for hibernation using the - * platform driver if so configured and return an error code if it fails - */ - -static int platform_pre_snapshot(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_snapshot() : 0; -} - -/** - * platform_leave - prepare the machine for switching to the normal mode - * of operation using the platform driver (called with interrupts disabled) - */ - -static void platform_leave(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->leave(); -} - -/** - * platform_finish - switch the machine to the normal mode of operation - * using the platform driver (must be called after platform_prepare()) - */ - -static void platform_finish(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->finish(); -} - -/** - * platform_pre_restore - prepare the platform for the restoration from a - * hibernation image. If the restore fails after this function has been - * called, platform_restore_cleanup() must be called. - */ - -static int platform_pre_restore(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_restore() : 0; -} - -/** - * platform_restore_cleanup - switch the platform to the normal mode of - * operation after a failing restore. If platform_pre_restore() has been - * called before the failing restore, this function must be called too, - * regardless of the result of platform_pre_restore(). 
- */ - -static void platform_restore_cleanup(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->restore_cleanup(); -} - -/** - * platform_recover - recover the platform from a failure to suspend - * devices. - */ - -static void platform_recover(int platform_mode) -{ - if (platform_mode && hibernation_ops && hibernation_ops->recover) - hibernation_ops->recover(); -} - -/** - * create_image - freeze devices that need to be frozen with interrupts - * off, create the hibernation image and thaw those devices. Control - * reappears in this routine after a restore. - */ - -static int create_image(int platform_mode) -{ - int error; - - error = arch_prepare_suspend(); - if (error) - return error; - - device_pm_lock(); - local_irq_disable(); - /* At this point, device_suspend() has been called, but *not* - * device_power_down(). We *must* call device_power_down() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. - */ - error = device_power_down(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); - goto Enable_irqs; - } - - if (hibernation_test(TEST_CORE)) - goto Power_up; - - in_suspend = 1; - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); - /* Restore control flow magically appears here */ - restore_processor_state(); - if (!in_suspend) - platform_leave(platform_mode); - Power_up: - /* NOTE: device_power_up() is just a resume() for devices - * that suspended with irqs off ... no overall powerup. - */ - device_power_up(in_suspend ? - (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Enable_irqs: - local_irq_enable(); - device_pm_unlock(); - return error; -} - -/** - * hibernation_snapshot - quiesce devices and create the hibernation - * snapshot image. - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. - * - * Must be called with pm_mutex held - */ - -int hibernation_snapshot(int platform_mode) -{ - int error, ftrace_save; - - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - return error; - - error = platform_begin(platform_mode); - if (error) - goto Close; - - suspend_console(); - ftrace_save = __ftrace_enabled_save(); - error = device_suspend(PMSG_FREEZE); - if (error) - goto Recover_platform; - - if (hibernation_test(TEST_DEVICES)) - goto Recover_platform; - - error = platform_pre_snapshot(platform_mode); - if (error || hibernation_test(TEST_PLATFORM)) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error) { - if (hibernation_test(TEST_CPUS)) - goto Enable_cpus; - - if (hibernation_testmode(HIBERNATION_TEST)) - goto Enable_cpus; - - error = create_image(platform_mode); - /* Control returns here after successful restore */ - } - Enable_cpus: - enable_nonboot_cpus(); - Finish: - platform_finish(platform_mode); - Resume_devices: - device_resume(in_suspend ? - (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); - resume_console(); - Close: - platform_end(platform_mode); - return error; - - Recover_platform: - platform_recover(platform_mode); - goto Resume_devices; -} - -/** - * resume_target_kernel - prepare devices that need to be suspended with - * interrupts off, restore the contents of highmem that have not been - * restored yet from the image and run the low level code that will restore - * the remaining contents of memory and switch to the just restored target - * kernel. 
- */ - -static int resume_target_kernel(void) -{ - int error; - - device_pm_lock(); - local_irq_disable(); - error = device_power_down(PMSG_QUIESCE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); - goto Enable_irqs; - } - /* We'll ignore saved state, but this gets preempt count (etc) right */ - save_processor_state(); - error = restore_highmem(); - if (!error) { - error = swsusp_arch_resume(); - /* - * The code below is only ever reached in case of a failure. - * Otherwise execution continues at place where - * swsusp_arch_suspend() was called - */ - BUG_ON(!error); - /* This call to restore_highmem() undos the previous one */ - restore_highmem(); - } - /* - * The only reason why swsusp_arch_resume() can fail is memory being - * very tight, so we have to free it as soon as we can to avoid - * subsequent failures - */ - swsusp_free(); - restore_processor_state(); - touch_softlockup_watchdog(); - device_power_up(PMSG_RECOVER); - Enable_irqs: - local_irq_enable(); - device_pm_unlock(); - return error; -} - -/** - * hibernation_restore - quiesce devices and restore the hibernation - * snapshot image. If successful, control returns in hibernation_snaphot() - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. 
- * - * Must be called with pm_mutex held - */ - -int hibernation_restore(int platform_mode) -{ - int error, ftrace_save; - - pm_prepare_console(); - suspend_console(); - ftrace_save = __ftrace_enabled_save(); - error = device_suspend(PMSG_QUIESCE); - if (error) - goto Finish; - - error = platform_pre_restore(platform_mode); - if (!error) { - error = disable_nonboot_cpus(); - if (!error) - error = resume_target_kernel(); - enable_nonboot_cpus(); - } - platform_restore_cleanup(platform_mode); - device_resume(PMSG_RECOVER); - Finish: - __ftrace_enabled_restore(ftrace_save); - resume_console(); - pm_restore_console(); - return error; -} - -/** - * hibernation_platform_enter - enter the hibernation state using the - * platform driver (if available) - */ - -int hibernation_platform_enter(void) -{ - int error, ftrace_save; - - if (!hibernation_ops) - return -ENOSYS; - - /* - * We have cancelled the power transition by running - * hibernation_ops->finish() before saving the image, so we should let - * the firmware know that we're going to enter the sleep state after all - */ - error = hibernation_ops->begin(); - if (error) - goto Close; - - suspend_console(); - ftrace_save = __ftrace_enabled_save(); - error = device_suspend(PMSG_HIBERNATE); - if (error) { - if (hibernation_ops->recover) - hibernation_ops->recover(); - goto Resume_devices; - } - - error = hibernation_ops->prepare(); - if (error) - goto Resume_devices; - - error = disable_nonboot_cpus(); - if (error) - goto Finish; - - device_pm_lock(); - local_irq_disable(); - error = device_power_down(PMSG_HIBERNATE); - if (!error) { - hibernation_ops->enter(); - /* We should never get here */ - while (1); - } - local_irq_enable(); - device_pm_unlock(); - - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. 
- */ - Finish: - hibernation_ops->finish(); - Resume_devices: - device_resume(PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); - resume_console(); - Close: - hibernation_ops->end(); - return error; -} - -/** - * power_down - Shut the machine down for hibernation. - * - * Use the platform driver, if configured so; otherwise try - * to power off or reboot. - */ - -static void power_down(void) -{ - switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; - case HIBERNATION_REBOOT: - kernel_restart(NULL); - break; - case HIBERNATION_PLATFORM: - hibernation_platform_enter(); - case HIBERNATION_SHUTDOWN: - kernel_power_off(); - break; - } - kernel_halt(); - /* - * Valid image is on the disk, if we continue we risk serious data - * corruption after resume. - */ - printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); -} - -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - -/** - * hibernate - The granpappy of the built-in hibernation management - */ - -int hibernate(void) -{ - int error; - - mutex_lock(&pm_mutex); - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - goto Exit; - - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - - printk(KERN_INFO "PM: Syncing filesystems ... 
"); - sys_sync(); - printk("done.\n"); - - error = prepare_processes(); - if (error) - goto Finish; - - if (hibernation_test(TEST_FREEZER)) - goto Thaw; - - if (hibernation_testmode(HIBERNATION_TESTPROC)) - goto Thaw; - - error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (in_suspend && !error) { - unsigned int flags = 0; - - if (hibernation_mode == HIBERNATION_PLATFORM) - flags |= SF_PLATFORM_MODE; - pr_debug("PM: writing image.\n"); - error = swsusp_write(flags); - swsusp_free(); - if (!error) - power_down(); - } else { - pr_debug("PM: Image restored successfully.\n"); - swsusp_free(); - } - Thaw: - thaw_processes(); - Finish: - free_basic_memory_bitmaps(); - Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - - -/** - * software_resume - Resume from a saved image. - * - * Called as a late_initcall (so all devices are discovered and - * initialized), we call swsusp to see if we have a saved image or not. - * If so, we quiesce devices, the restore the saved image. We will - * return above (in hibernate() ) if everything goes well. - * Otherwise, we fail gracefully and return to the normally - * scheduled program. - * - */ - -static int software_resume(void) -{ - int error; - unsigned int flags; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. 
- */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); - if (!swsusp_resume_device) { - if (!strlen(resume_file)) { - mutex_unlock(&pm_mutex); - return -ENOENT; - } - swsusp_resume_device = name_to_dev_t(resume_file); - pr_debug("PM: Resume from partition %s\n", resume_file); - } else { - pr_debug("PM: Resume from partition %d:%d\n", - MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); - } - - if (noresume) { - /** - * FIXME: If noresume is specified, we need to find the - * partition and reset it back to normal swap space. - */ - mutex_unlock(&pm_mutex); - return 0; - } - - pr_debug("PM: Checking hibernation image.\n"); - error = swsusp_check(); - if (error) - goto Unlock; - - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - goto Finish; - - error = create_basic_memory_bitmaps(); - if (error) - goto Finish; - - pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); - if (error) { - swsusp_close(); - goto Done; - } - - pr_debug("PM: Reading hibernation image.\n"); - - error = swsusp_read(&flags); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Restore failed, recovering.\n"); - swsusp_free(); - thaw_processes(); - Done: - free_basic_memory_bitmaps(); - Finish: - pm_notifier_call_chain(PM_POST_RESTORE); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - /* For success case, the suspend path will release the lock */ - Unlock: - mutex_unlock(&pm_mutex); - pr_debug("PM: Resume from disk failed.\n"); - return error; -} - -late_initcall(software_resume); - - -static const char * const hibernation_modes[] = { - [HIBERNATION_PLATFORM] = "platform", - [HIBERNATION_SHUTDOWN] = "shutdown", - [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - 
[HIBERNATION_TESTPROC] = "testproc", -}; - -/** - * disk - Control hibernation mode - * - * Suspend-to-disk can be handled in several ways. We have a few options - * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other hibernation_ops), powering off the system or rebooting the - * system (for testing) as well as the two test modes. - * - * The system can support 'platform', and that is known a priori (and - * encoded by the presence of hibernation_ops). However, the user may - * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the - * test modes, 'test' or 'testproc'. - * - * show() will display what the mode is currently set to. - * store() will accept one of - * - * 'platform' - * 'shutdown' - * 'reboot' - * 'test' - * 'testproc' - * - * It will only change to 'platform' if the system - * supports it (as determined by having hibernation_ops). - */ - -static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - int i; - char *start = buf; - - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (!hibernation_modes[i]) - continue; - switch (i) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - break; - /* not a valid mode, continue with loop */ - continue; - } - if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); - else - buf += sprintf(buf, "%s ", hibernation_modes[i]); - } - buf += sprintf(buf, "\n"); - return buf-start; -} - - -static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - int error = 0; - int i; - int len; - char *p; - int mode = HIBERNATION_INVALID; - - p = memchr(buf, '\n', n); - len = p ? 
p - buf : n; - - mutex_lock(&pm_mutex); - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (len == strlen(hibernation_modes[i]) - && !strncmp(buf, hibernation_modes[i], len)) { - mode = i; - break; - } - } - if (mode != HIBERNATION_INVALID) { - switch (mode) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - hibernation_mode = mode; - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - hibernation_mode = mode; - else - error = -EINVAL; - } - } else - error = -EINVAL; - - if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", - hibernation_modes[mode]); - mutex_unlock(&pm_mutex); - return error ? error : n; -} - -power_attr(disk); - -static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); -} - -static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned int maj, min; - dev_t res; - int ret = -EINVAL; - - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; - - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; - - mutex_lock(&pm_mutex); - swsusp_resume_device = res; - mutex_unlock(&pm_mutex); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); - noresume = 0; - software_resume(); - ret = n; - out: - return ret; -} - -power_attr(resume); - -static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", image_size); -} - -static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long size; - - if (sscanf(buf, "%lu", &size) == 1) { - image_size = size; - return n; - } - - return -EINVAL; -} - -power_attr(image_size); - -static struct attribute * g[] = { - &disk_attr.attr, - &resume_attr.attr, - &image_size_attr.attr, - 
NULL, -}; - - -static struct attribute_group attr_group = { - .attrs = g, -}; - - -static int __init pm_disk_init(void) -{ - return sysfs_create_group(power_kobj, &attr_group); -} - -core_initcall(pm_disk_init); - - -static int __init resume_setup(char *str) -{ - if (noresume) - return 1; - - strncpy( resume_file, str, 255 ); - return 1; -} - -static int __init resume_offset_setup(char *str) -{ - unsigned long long offset; - - if (noresume) - return 1; - - if (sscanf(str, "%llu", &offset) == 1) - swsusp_resume_block = offset; - - return 1; -} - -static int __init noresume_setup(char *str) -{ - noresume = 1; - return 1; -} - -__setup("noresume", noresume_setup); -__setup("resume_offset=", resume_offset_setup); -__setup("resume=", resume_setup); -/* - * kernel/power/main.c - PM subsystem core functionality. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - -DEFINE_MUTEX(pm_mutex); - -unsigned int pm_flags; -EXPORT_SYMBOL(pm_flags); - -#ifdef CONFIG_PM_SLEEP - -/* Routines for PM-transition notifications */ - -static BLOCKING_NOTIFIER_HEAD(pm_chain_head); - -int register_pm_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&pm_chain_head, nb); -} -EXPORT_SYMBOL_GPL(register_pm_notifier); - -int unregister_pm_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&pm_chain_head, nb); -} -EXPORT_SYMBOL_GPL(unregister_pm_notifier); - -int pm_notifier_call_chain(unsigned long val) -{ - return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) - == NOTIFY_BAD) ? 
-EINVAL : 0; -} - -#ifdef CONFIG_PM_DEBUG -int pm_test_level = TEST_NONE; - -static int suspend_test(int level) -{ - if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); - return 1; - } - return 0; -} - -static const char * const pm_tests[__TEST_AFTER_LAST] = { - [TEST_NONE] = "none", - [TEST_CORE] = "core", - [TEST_CPUS] = "processors", - [TEST_PLATFORM] = "platform", - [TEST_DEVICES] = "devices", - [TEST_FREEZER] = "freezer", -}; - -static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - char *s = buf; - int level; - - for (level = TEST_FIRST; level <= TEST_MAX; level++) - if (pm_tests[level]) { - if (level == pm_test_level) - s += sprintf(s, "[%s] ", pm_tests[level]); - else - s += sprintf(s, "%s ", pm_tests[level]); - } - - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; - - return (s - buf); -} - -static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - const char * const *s; - int level; - char *p; - int len; - int error = -EINVAL; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - mutex_lock(&pm_mutex); - - level = TEST_FIRST; - for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { - pm_test_level = level; - error = 0; - break; - } - - mutex_unlock(&pm_mutex); - - return error ? error : n; -} - -power_attr(pm_test); -#else /* !CONFIG_PM_DEBUG */ -static inline int suspend_test(int level) { return 0; } -#endif /* !CONFIG_PM_DEBUG */ - -#endif /* CONFIG_PM_SLEEP */ - -#ifdef CONFIG_SUSPEND - -#ifdef CONFIG_PM_TEST_SUSPEND - -/* - * We test the system suspend code by setting an RTC wakealarm a short - * time in the future, then suspending. Suspending the devices won't - * normally take long ... some systems only need a few milliseconds. 
- * - * The time it takes is system-specific though, so when we test this - * during system bootup we allow a LOT of time. - */ -#define TEST_SUSPEND_SECONDS 5 - -static unsigned long suspend_test_start_time; - -static void suspend_test_start(void) -{ - /* FIXME Use better timebase than "jiffies", ideally a clocksource. - * What we want is a hardware counter that will work correctly even - * during the irqs-are-off stages of the suspend/resume cycle... - */ - suspend_test_start_time = jiffies; -} - -static void suspend_test_finish(const char *label) -{ - long nj = jiffies - suspend_test_start_time; - unsigned msec; - - msec = jiffies_to_msecs(abs(nj)); - pr_info("PM: %s took %d.%03d seconds\n", label, - msec / 1000, msec % 1000); - - /* Warning on suspend means the RTC alarm period needs to be - * larger -- the system was sooo slooowwww to suspend that the - * alarm (should have) fired before the system went to sleep! - * - * Warning on either suspend or resume also means the system - * has some performance issues. The stack dump of a WARN_ON - * is more likely to get the right attention than a printk... - */ - WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); -} - -#else - -static void suspend_test_start(void) -{ -} - -static void suspend_test_finish(const char *label) -{ -} - -#endif - -/* This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) - -static struct platform_suspend_ops *suspend_ops; - -/** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. - */ - -void suspend_set_ops(struct platform_suspend_ops *ops) -{ - mutex_lock(&pm_mutex); - suspend_ops = ops; - mutex_unlock(&pm_mutex); -} - -/** - * suspend_valid_only_mem - generic memory-only valid callback - * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. 
- */ -int suspend_valid_only_mem(suspend_state_t state) -{ - return state == PM_SUSPEND_MEM; -} - -/** - * suspend_prepare - Do prep work before entering low-power state. - * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. - */ -static int suspend_prepare(void) -{ - int error; - unsigned int free_pages; - - if (!suspend_ops || !suspend_ops->enter) - return -EPERM; - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) - goto Finish; - - if (suspend_freeze_processes()) { - error = -EAGAIN; - goto Thaw; - } - - free_pages = global_page_state(NR_FREE_PAGES); - if (free_pages < FREE_PAGE_NUMBER) { - pr_debug("PM: free some memory\n"); - shrink_all_memory(FREE_PAGE_NUMBER - free_pages); - if (nr_free_pages() < FREE_PAGE_NUMBER) { - error = -ENOMEM; - printk(KERN_ERR "PM: No enough memory\n"); - } - } - if (!error) - return 0; - - Thaw: - suspend_thaw_processes(); - Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - return error; -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) -{ - local_irq_disable(); -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) -{ - local_irq_enable(); -} - -/** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter - * - * This function should be called after devices have been suspended. 
- */ -static int suspend_enter(suspend_state_t state) -{ - int error = 0; - - device_pm_lock(); - arch_suspend_disable_irqs(); - BUG_ON(!irqs_disabled()); - - if ((error = device_power_down(PMSG_SUSPEND))) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Done; - } - - if (!suspend_test(TEST_CORE)) - error = suspend_ops->enter(state); - - device_power_up(PMSG_RESUME); - Done: - arch_suspend_enable_irqs(); - BUG_ON(irqs_disabled()); - device_pm_unlock(); - return error; -} - -/** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter - */ -int suspend_devices_and_enter(suspend_state_t state) -{ - int error, ftrace_save; - - if (!suspend_ops) - return -ENOSYS; - - if (suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } - suspend_console(); - ftrace_save = __ftrace_enabled_save(); - suspend_test_start(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Recover_platform; - } - suspend_test_finish("suspend devices"); - if (suspend_test(TEST_DEVICES)) - goto Recover_platform; - - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Resume_devices; - } - - if (suspend_test(TEST_PLATFORM)) - goto Finish; - - error = disable_nonboot_cpus(); - if (!error && !suspend_test(TEST_CPUS)) - suspend_enter(state); - - enable_nonboot_cpus(); - Finish: - if (suspend_ops->finish) - suspend_ops->finish(); - Resume_devices: - suspend_test_start(); - device_resume(PMSG_RESUME); - suspend_test_finish("resume devices"); - __ftrace_enabled_restore(ftrace_save); - resume_console(); - Close: - if (suspend_ops->end) - suspend_ops->end(); - return error; - - Recover_platform: - if (suspend_ops->recover) - suspend_ops->recover(); - goto Resume_devices; -} - -/** - * suspend_finish - Do final work before exiting suspend sequence. 
- * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. - */ -static void suspend_finish(void) -{ - suspend_thaw_processes(); - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); -} - - - - -static const char * const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", -}; - -static inline int valid_state(suspend_state_t state) -{ - /* All states need lowlevel support and need to be valid - * to the lowlevel implementation, no valid callback - * implies that none are valid. */ - if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state)) - return 0; - return 1; -} - - -/** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. - * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). - */ -static int enter_state(suspend_state_t state) -{ - int error; - - if (!valid_state(state)) - return -ENODEV; - - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; - - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); - if (error) - goto Unlock; - - if (suspend_test(TEST_FREEZER)) - goto Finish; - - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_devices_and_enter(state); - - Finish: - pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - - -/** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. 
- * - * Determine whether or not value is within range, get state - * structure, and enter (above). - */ - -int pm_suspend(suspend_state_t state) -{ - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); - return -EINVAL; -} - -EXPORT_SYMBOL(pm_suspend); - -#endif /* CONFIG_SUSPEND */ - -struct kobject *power_kobj; - -/** - * state - control system power state. - * - * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). - * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. - */ - -static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - char *s = buf; -#ifdef CONFIG_SUSPEND - int i; - - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } -#endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; -#endif - return (s - buf); -} - -static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ -#ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; - const char * const *s; -#endif - char *p; - int len; - int error = -EINVAL; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - /* First, check if we are requested to hibernate */ - if (len == 4 && !strncmp(buf, "disk", len)) { - error = hibernate(); - goto Exit; - } - -#ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) - break; - } - if (state < PM_SUSPEND_MAX && *s) - error = enter_state(state); -#endif - - Exit: - return error ? 
error : n; -} - -power_attr(state); - -#ifdef CONFIG_PM_TRACE -int pm_trace_enabled; - -static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", pm_trace_enabled); -} - -static ssize_t -pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - int val; - - if (sscanf(buf, "%d", &val) == 1) { - pm_trace_enabled = !!val; - return n; - } - return -EINVAL; -} - -power_attr(pm_trace); -#endif /* CONFIG_PM_TRACE */ - -static struct attribute * g[] = { - &state_attr.attr, -#ifdef CONFIG_PM_TRACE - &pm_trace_attr.attr, -#endif -#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) - &pm_test_attr.attr, -#endif - NULL, -}; - -static struct attribute_group attr_group = { - .attrs = g, -}; - - -static int __init pm_init(void) -{ - power_kobj = kobject_create_and_add("power", NULL); - if (!power_kobj) - return -ENOMEM; - return sysfs_create_group(power_kobj, &attr_group); -} - -core_initcall(pm_init); - - -#ifdef CONFIG_PM_TEST_SUSPEND - -#include - -/* - * To test system suspend, we need a hands-off mechanism to resume the - * system. RTCs wake alarms are a common self-contained mechanism. 
- */ - -static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) -{ - static char err_readtime[] __initdata = - KERN_ERR "PM: can't read %s time, err %d\n"; - static char err_wakealarm [] __initdata = - KERN_ERR "PM: can't set %s wakealarm, err %d\n"; - static char err_suspend[] __initdata = - KERN_ERR "PM: suspend test failed, error %d\n"; - static char info_test[] __initdata = - KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - - unsigned long now; - struct rtc_wkalrm alm; - int status; - - /* this may fail if the RTC hasn't been initialized */ - status = rtc_read_time(rtc, &alm.time); - if (status < 0) { - printk(err_readtime, rtc->dev.bus_id, status); - return; - } - rtc_tm_to_time(&alm.time, &now); - - memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); - alm.enabled = true; - - status = rtc_set_alarm(rtc, &alm); - if (status < 0) { - printk(err_wakealarm, rtc->dev.bus_id, status); - return; - } - - if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - if (status == -ENODEV) - state = PM_SUSPEND_STANDBY; - } - if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - } - if (status < 0) - printk(err_suspend, status); - - /* Some platforms can't detect that the alarm triggered the - * wakeup, or (accordingly) disable it after it afterwards. - * It's supposed to give oneshot behavior; cope. - */ - alm.enabled = false; - rtc_set_alarm(rtc, &alm); -} - -static int __init has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(char **)name_ptr = dev->bus_id; - return 1; -} - -/* - * Kernel options like "test_suspend=mem" force suspend/resume sanity tests - * at startup time. 
They're normally disabled, for faster boot and because - * we can't know which states really work on this particular system. - */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; - -static char warn_bad_state[] __initdata = - KERN_WARNING "PM: can't test '%s' suspend state\n"; - -static int __init setup_test_suspend(char *value) -{ - unsigned i; - - /* "=mem" ==> "mem" */ - value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } - printk(warn_bad_state, value); - return 0; -} -__setup("test_suspend", setup_test_suspend); - -static int __init test_suspend(void) -{ - static char warn_no_rtc[] __initdata = - KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - - char *pony = NULL; - struct rtc_device *rtc = NULL; - - /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); - goto done; - } - - /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); - if (!rtc) { - printk(warn_no_rtc); - goto done; - } - - /* go for it */ - test_wakealarm(rtc, test_state); - rtc_class_close(rtc); -done: - return 0; -} -late_initcall(test_suspend); - -#endif /* CONFIG_PM_TEST_SUSPEND */ -/* - * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * When the user hits Sys-Rq o to power down the machine this is the - * callback we use. 
- */ - -static void do_poweroff(struct work_struct *dummy) -{ - kernel_power_off(); -} - -static DECLARE_WORK(poweroff_work, do_poweroff); - -static void handle_poweroff(int key, struct tty_struct *tty) -{ - /* run sysrq poweroff on boot cpu */ - schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); -} - -static struct sysrq_key_op sysrq_poweroff_op = { - .handler = handle_poweroff, - .help_msg = "powerOff", - .action_msg = "Power Off", - .enable_mask = SYSRQ_ENABLE_BOOT, -}; - -static int pm_sysrq_init(void) -{ - register_sysrq_key('o', &sysrq_poweroff_op); - return 0; -} - -subsys_initcall(pm_sysrq_init); -/* - * drivers/power/process.c - Functions for starting/stopping processes on - * suspend transitions. - * - * Originally from swsusp. - */ - - -#undef DEBUG - -#include -#include -#include -#include -#include - -/* - * Timeout for stopping processes - */ -#define TIMEOUT (20 * HZ) - -static inline int freezeable(struct task_struct * p) -{ - if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) - return 0; - return 1; -} - -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - wmb(); - } - clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? 
*/ - long save; - - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) - break; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static inline bool should_send_signal(struct task_struct *p) -{ - return !(p->flags & PF_FREEZER_NOSIG); -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise - * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. - */ -static bool freeze_task(struct task_struct *p, bool sig_only) -{ - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. 
- */ - if (!freezing(p)) { - rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } - - if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else if (sig_only) { - return false; - } else { - wake_up_state(p, TASK_INTERRUPTIBLE); - } - - return true; -} - -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - -static int try_to_freeze_tasks(bool sig_only) -{ - struct task_struct *g, *p; - unsigned long end_time; - unsigned int todo; - struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; - - do_gettimeofday(&start); - - end_time = jiffies + TIMEOUT; - do { - todo = 0; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (frozen(p) || !freezeable(p)) - continue; - - if (!freeze_task(p, sig_only)) - continue; - - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) - todo++; - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ - if (time_after(jiffies, end_time)) - break; - } while (todo); - - do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; - - if (todo) { - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. 
- */ - printk("\n"); - printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); - show_state(); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - if (freezing(p) && !freezer_should_skip(p)) - printk(KERN_ERR " %s\n", p->comm); - cancel_freezing(p); - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); - } - - return todo ? -EBUSY : 0; -} - -/** - * freeze_processes - tell processes to enter the refrigerator - */ -int freeze_processes(void) -{ - int error; - - printk("Freezing user space processes ... "); - error = try_to_freeze_tasks(true); - if (error) - goto Exit; - printk("done.\n"); - - printk("Freezing remaining freezable tasks ... "); - error = try_to_freeze_tasks(false); - if (error) - goto Exit; - printk("done."); - Exit: - BUG_ON(in_atomic()); - printk("\n"); - return error; -} - -static void thaw_tasks(bool nosig_only) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!freezeable(p)) - continue; - - if (nosig_only && should_send_signal(p)) - continue; - - thaw_process(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -} - -void thaw_processes(void) -{ - printk("Restarting tasks ... "); - thaw_tasks(true); - thaw_tasks(false); - schedule(); - printk("done.\n"); -} - -EXPORT_SYMBOL(refrigerator); -/* - * linux/kernel/power/snapshot.c - * - * This file provides system snapshot/restore functionality for swsusp. - * - * Copyright (C) 1998-2005 Pavel Machek - * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "power.h" - -static int swsusp_page_is_free(struct page *); -static void swsusp_set_page_forbidden(struct page *); -static void swsusp_unset_page_forbidden(struct page *); - -/* List of PBEs needed for restoring the pages that were allocated before - * the suspend and included in the suspend image, but have also been - * allocated by the "resume" kernel, so their contents cannot be written - * directly to their "original" page frames. - */ -struct pbe *restore_pblist; - -/* Pointer to an auxiliary buffer (1 page) */ -static void *buffer; - -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. The unsafe pages have PageNosaveFree set - * and we count them using unsafe_pages. - * - * Each allocated image page is marked as PageNosave and PageNosaveFree - * so that swsusp_free() can release it. 
- */ - -#define PG_ANY 0 -#define PG_SAFE 1 -#define PG_UNSAFE_CLEAR 1 -#define PG_UNSAFE_KEEP 0 - -static unsigned int allocated_unsafe_pages; - -static void *get_image_page(gfp_t gfp_mask, int safe_needed) -{ - void *res; - - res = (void *)get_zeroed_page(gfp_mask); - if (safe_needed) - while (res && swsusp_page_is_free(virt_to_page(res))) { - /* The page is unsafe, mark it for swsusp_free() */ - swsusp_set_page_forbidden(virt_to_page(res)); - allocated_unsafe_pages++; - res = (void *)get_zeroed_page(gfp_mask); - } - if (res) { - swsusp_set_page_forbidden(virt_to_page(res)); - swsusp_set_page_free(virt_to_page(res)); - } - return res; -} - -unsigned long get_safe_page(gfp_t gfp_mask) -{ - return (unsigned long)get_image_page(gfp_mask, PG_SAFE); -} - -static struct page *alloc_image_page(gfp_t gfp_mask) -{ - struct page *page; - - page = alloc_page(gfp_mask); - if (page) { - swsusp_set_page_forbidden(page); - swsusp_set_page_free(page); - } - return page; -} - -/** - * free_image_page - free page represented by @addr, allocated with - * get_image_page (page flags set by it must be cleared) - */ - -static inline void free_image_page(void *addr, int clear_nosave_free) -{ - struct page *page; - - BUG_ON(!virt_addr_valid(addr)); - - page = virt_to_page(addr); - - swsusp_unset_page_forbidden(page); - if (clear_nosave_free) - swsusp_unset_page_free(page); - - __free_page(page); -} - -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) - -struct linked_page { - struct linked_page *next; - char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); - -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) -{ - while (list) { - struct linked_page *lp = list->next; - - free_image_page(list, clear_page_nosave); - list = lp; - } -} - -/** - * struct chain_allocator is used for allocating small objects out of - * a linked list of pages called 'the chain'. 
- * - * The chain grows each time when there is no room for a new object in - * the current page. The allocated objects cannot be freed individually. - * It is only possible to free them all at once, by freeing the entire - * chain. - * - * NOTE: The chain allocator may be inefficient if the allocated objects - * are not much smaller than PAGE_SIZE. - */ - -struct chain_allocator { - struct linked_page *chain; /* the chain */ - unsigned int used_space; /* total size of objects allocated out - * of the current page - */ - gfp_t gfp_mask; /* mask for allocating pages */ - int safe_needed; /* if set, only "safe" pages are allocated */ -}; - -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) -{ - ca->chain = NULL; - ca->used_space = LINKED_PAGE_DATA_SIZE; - ca->gfp_mask = gfp_mask; - ca->safe_needed = safe_needed; -} - -static void *chain_alloc(struct chain_allocator *ca, unsigned int size) -{ - void *ret; - - if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { - struct linked_page *lp; - - lp = get_image_page(ca->gfp_mask, ca->safe_needed); - if (!lp) - return NULL; - - lp->next = ca->chain; - ca->chain = lp; - ca->used_space = 0; - } - ret = ca->chain->data + ca->used_space; - ca->used_space += size; - return ret; -} - -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - -/** - * Data types related to memory bitmaps. - * - * Memory bitmap is a structure consiting of many linked lists of - * objects. The main list's elements are of type struct zone_bitmap - * and each of them corresonds to one zone. For each zone bitmap - * object there is a list of objects of type struct bm_block that - * represent each blocks of bitmap in which information is stored. 
- * - * struct memory_bitmap contains a pointer to the main list of zone - * bitmap objects, a struct bm_position used for browsing the bitmap, - * and a pointer to the list of pages used for allocating all of the - * zone bitmap objects and bitmap block objects. - * - * NOTE: It has to be possible to lay out the bitmap in memory - * using only allocations of order 0. Additionally, the bitmap is - * designed to work with arbitrary number of zones (this is over the - * top for now, but let's avoid making unnecessary assumptions ;-). - * - * struct zone_bitmap contains a pointer to a list of bitmap block - * objects and a pointer to the bitmap block object that has been - * most recently used for setting bits. Additionally, it contains the - * pfns that correspond to the start and end of the represented zone. - * - * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bitmap) - * It also contains the pfns that correspond to the start and end of - * the represented memory area. 
- */ - -#define BM_END_OF_MAP (~0UL) - -#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) - -struct bm_block { - struct bm_block *next; /* next element of the list */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ -}; - -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} - -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - -/* strcut bm_position is used for browsing memory bitmaps */ - -struct bm_position { - struct zone_bitmap *zone_bm; - struct bm_block *block; - int bit; -}; - -struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ - struct linked_page *p_list; /* list of pages used to store zone - * bitmap objects and bitmap block - * objects - */ - struct bm_position cur; /* most recently used bit position */ -}; - -/* Functions that operate on memory bitmaps */ - -static void memory_bm_position_reset(struct memory_bitmap *bm) -{ - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; -} - -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); - -/** - * create_bm_block_list - create a list of block bitmap objects - */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) -{ - struct bm_block *bblist = NULL; - - while (nr_blocks-- > 0) { - struct bm_block *bb; - - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; - } - return bblist; -} - 
-/** - * create_zone_bm_list - create a list of zone bitmap objects - */ - -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) -{ - struct zone_bitmap *zbmlist = NULL; - - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; - - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) - return NULL; - - zbm->next = zbmlist; - zbmlist = zbm; - } - return zbmlist; -} - -/** - * memory_bm_create - allocate memory for a memory bitmap - */ - -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) -{ - struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; - - chain_init(&ca, gfp_mask, safe_needed); - - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; - - if (!populated_zone(zone)) - continue; - - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; - - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; - - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; - - bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += nr; - } - bb->end_pfn = pfn; - bb = bb->next; - 
} - zone_bm = zone_bm->next; - } - bm->p_list = ca.chain; - memory_bm_position_reset(bm); - return 0; - - Free: - bm->p_list = ca.chain; - memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; -} - -/** - * memory_bm_free - free memory occupied by the memory bitmap @bm - */ - -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) -{ - struct zone_bitmap *zone_bm; - - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; - - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } - free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; -} - -/** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. - */ - -static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) -{ - struct zone_bitmap *zone_bm; - struct bm_block *bb; - - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; - if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; - - while (pfn >= bb->end_pfn) { - bb = bb->next; - - BUG_ON(!bb); - } - zone_bm->cur_block = bb; - pfn -= bb->start_pfn; - *bit_nr = pfn; - *addr = bb->data; - return 0; -} - -static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = 
memory_bm_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - set_bit(bit, addr); -} - -static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - if (!error) - set_bit(bit, addr); - return error; -} - -static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - clear_bit(bit, addr); -} - -static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - return test_bit(bit, addr); -} - -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. - * - * It is required to run memory_bm_position_reset() before the first call to - * this function. - */ - -static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) -{ - struct zone_bitmap *zone_bm; - struct bm_block *bb; - int bit; - - do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); - memory_bm_position_reset(bm); - return BM_END_OF_MAP; - - Return_pfn: - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; -} - -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. 
- */ - -struct nosave_region { - struct list_head list; - unsigned long start_pfn; - unsigned long end_pfn; -}; - -static LIST_HEAD(nosave_regions); - -/** - * register_nosave_region - register a range of page frames the contents - * of which should not be saved during the suspend (to be used in the early - * initialization code) - */ - -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, - int use_kmalloc) -{ - struct nosave_region *region; - - if (start_pfn >= end_pfn) - return; - - if (!list_empty(&nosave_regions)) { - /* Try to extend the previous region (they should be sorted) */ - region = list_entry(nosave_regions.prev, - struct nosave_region, list); - if (region->end_pfn == start_pfn) { - region->end_pfn = end_pfn; - goto Report; - } - } - if (use_kmalloc) { - /* during init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else - /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); - region->start_pfn = start_pfn; - region->end_pfn = end_pfn; - list_add_tail(®ion->list, &nosave_regions); - Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); -} - -/* - * Set bits in this map correspond to the page frames the contents of which - * should not be saved during the suspend. - */ -static struct memory_bitmap *forbidden_pages_map; - -/* Set bits in this map correspond to free page frames. */ -static struct memory_bitmap *free_pages_map; - -/* - * Each page frame allocated for creating the image is marked by setting the - * corresponding bits in forbidden_pages_map and free_pages_map simultaneously - */ - -void swsusp_set_page_free(struct page *page) -{ - if (free_pages_map) - memory_bm_set_bit(free_pages_map, page_to_pfn(page)); -} - -static int swsusp_page_is_free(struct page *page) -{ - return free_pages_map ? 
- memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; -} - -void swsusp_unset_page_free(struct page *page) -{ - if (free_pages_map) - memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); -} - -static void swsusp_set_page_forbidden(struct page *page) -{ - if (forbidden_pages_map) - memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); -} - -int swsusp_page_is_forbidden(struct page *page) -{ - return forbidden_pages_map ? - memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; -} - -static void swsusp_unset_page_forbidden(struct page *page) -{ - if (forbidden_pages_map) - memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); -} - -/** - * mark_nosave_pages - set bits corresponding to the page frames the - * contents of which should not be saved in a given bitmap. - */ - -static void mark_nosave_pages(struct memory_bitmap *bm) -{ - struct nosave_region *region; - - if (list_empty(&nosave_regions)) - return; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } - } -} - -/** - * create_basic_memory_bitmaps - create bitmaps needed for marking page - * frames that should not be saved and free page frames. The pointers - * forbidden_pages_map and free_pages_map are only modified if everything - * goes well, because we don't want the bits to be used before both bitmaps - * are set up. 
- */ - -int create_basic_memory_bitmaps(void) -{ - struct memory_bitmap *bm1, *bm2; - int error = 0; - - BUG_ON(forbidden_pages_map || free_pages_map); - - bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm1) - return -ENOMEM; - - error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); - if (error) - goto Free_first_object; - - bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm2) - goto Free_first_bitmap; - - error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); - if (error) - goto Free_second_object; - - forbidden_pages_map = bm1; - free_pages_map = bm2; - mark_nosave_pages(forbidden_pages_map); - - pr_debug("PM: Basic memory bitmaps created\n"); - - return 0; - - Free_second_object: - kfree(bm2); - Free_first_bitmap: - memory_bm_free(bm1, PG_UNSAFE_CLEAR); - Free_first_object: - kfree(bm1); - return -ENOMEM; -} - -/** - * free_basic_memory_bitmaps - free memory bitmaps allocated by - * create_basic_memory_bitmaps(). The auxiliary pointers are necessary - * so that the bitmaps themselves are not referred to while they are being - * freed. 
- */ - -void free_basic_memory_bitmaps(void) -{ - struct memory_bitmap *bm1, *bm2; - - BUG_ON(!(forbidden_pages_map && free_pages_map)); - - bm1 = forbidden_pages_map; - bm2 = free_pages_map; - forbidden_pages_map = NULL; - free_pages_map = NULL; - memory_bm_free(bm1, PG_UNSAFE_CLEAR); - kfree(bm1); - memory_bm_free(bm2, PG_UNSAFE_CLEAR); - kfree(bm2); - - pr_debug("PM: Basic memory bitmaps freed\n"); -} - -/** - * snapshot_additional_pages - estimate the number of additional pages - * be needed for setting up the suspend image data structures for given - * zone (usually the returned value is greater than the exact number) - */ - -unsigned int snapshot_additional_pages(struct zone *zone) -{ - unsigned int res; - - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); - return 2 * res; -} - -#ifdef CONFIG_HIGHMEM -/** - * count_free_highmem_pages - compute the total number of free highmem - * pages, system-wide. - */ - -static unsigned int count_free_highmem_pages(void) -{ - struct zone *zone; - unsigned int cnt = 0; - - for_each_zone(zone) - if (populated_zone(zone) && is_highmem(zone)) - cnt += zone_page_state(zone, NR_FREE_PAGES); - - return cnt; -} - -/** - * saveable_highmem_page - Determine whether a highmem page should be - * included in the suspend image. - * - * We should save the page if it isn't Nosave or NosaveFree, or Reserved, - * and it isn't a part of a free chunk of pages. - */ - -static struct page *saveable_highmem_page(unsigned long pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - - BUG_ON(!PageHighMem(page)); - - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) - return NULL; - - return page; -} - -/** - * count_highmem_pages - compute the total number of saveable highmem - * pages. 
- */ - -unsigned int count_highmem_pages(void) -{ - struct zone *zone; - unsigned int n = 0; - - for_each_zone(zone) { - unsigned long pfn, max_zone_pfn; - - if (!is_highmem(zone)) - continue; - - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) - n++; - } - return n; -} -#else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } -#endif /* CONFIG_HIGHMEM */ - -/** - * saveable_page - Determine whether a non-highmem page should be included - * in the suspend image. - * - * We should save the page if it isn't Nosave, and is not in the range - * of pages statically defined as 'unsaveable', and it isn't a part of - * a free chunk of pages. - */ - -static struct page *saveable_page(unsigned long pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - - BUG_ON(PageHighMem(page)); - - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) - return NULL; - - if (PageReserved(page) - && (!kernel_page_present(page) || pfn_is_nosave(pfn))) - return NULL; - - return page; -} - -/** - * count_data_pages - compute the total number of saveable non-highmem - * pages. - */ - -unsigned int count_data_pages(void) -{ - struct zone *zone; - unsigned long pfn, max_zone_pfn; - unsigned int n = 0; - - for_each_zone(zone) { - if (is_highmem(zone)) - continue; - - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) - n++; - } - return n; -} - -/* This is needed, because copy_page and memcpy are not usable for copying - * task structs. 
- */ -static inline void do_copy_page(long *dst, long *src) -{ - int n; - - for (n = PAGE_SIZE / sizeof(long); n; n--) - *dst++ = *src++; -} - - -/** - * safe_copy_page - check if the page we are going to copy is marked as - * present in the kernel page tables (this always is the case if - * CONFIG_DEBUG_PAGEALLOC is not set and in that case - * kernel_page_present() always returns 'true'). - */ -static void safe_copy_page(void *dst, struct page *s_page) -{ - if (kernel_page_present(s_page)) { - do_copy_page(dst, page_address(s_page)); - } else { - kernel_map_pages(s_page, 1, 1); - do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); - } -} - - -#ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) -{ - return is_highmem(zone) ? - saveable_highmem_page(pfn) : saveable_page(pfn); -} - -static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) -{ - struct page *s_page, *d_page; - void *src, *dst; - - s_page = pfn_to_page(src_pfn); - d_page = pfn_to_page(dst_pfn); - if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); - do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); - kunmap_atomic(dst, KM_USER1); - } else { - if (PageHighMem(d_page)) { - /* Page pointed to by src may contain some kernel - * data modified by kmap_atomic() - */ - safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); - kunmap_atomic(dst, KM_USER0); - } else { - safe_copy_page(page_address(d_page), s_page); - } - } -} -#else -#define page_is_saveable(zone, pfn) saveable_page(pfn) - -static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) -{ - safe_copy_page(page_address(pfn_to_page(dst_pfn)), - pfn_to_page(src_pfn)); -} -#endif /* CONFIG_HIGHMEM */ - -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) -{ - struct zone *zone; - 
unsigned long pfn; - - for_each_zone(zone) { - unsigned long max_zone_pfn; - - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (page_is_saveable(zone, pfn)) - memory_bm_set_bit(orig_bm, pfn); - } - memory_bm_position_reset(orig_bm); - memory_bm_position_reset(copy_bm); - for(;;) { - pfn = memory_bm_next_pfn(orig_bm); - if (unlikely(pfn == BM_END_OF_MAP)) - break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); - } -} - -/* Total number of image pages */ -static unsigned int nr_copy_pages; -/* Number of pages needed for saving the original pfns of the image pages */ -static unsigned int nr_meta_pages; - -/** - * swsusp_free - free pages allocated for the suspend. - * - * Suspend pages are alocated before the atomic copy is made, so we - * need to release them after the resume. - */ - -void swsusp_free(void) -{ - struct zone *zone; - unsigned long pfn, max_zone_pfn; - - for_each_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } - } - nr_copy_pages = 0; - nr_meta_pages = 0; - restore_pblist = NULL; - buffer = NULL; -} - -#ifdef CONFIG_HIGHMEM -/** - * count_pages_for_highmem - compute the number of non-highmem pages - * that will be necessary for creating copies of highmem pages. 
- */ - -static unsigned int count_pages_for_highmem(unsigned int nr_highmem) -{ - unsigned int free_highmem = count_free_highmem_pages(); - - if (free_highmem >= nr_highmem) - nr_highmem = 0; - else - nr_highmem -= free_highmem; - - return nr_highmem; -} -#else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } -#endif /* CONFIG_HIGHMEM */ - -/** - * enough_free_mem - Make sure we have enough free memory for the - * snapshot image. - */ - -static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) -{ - struct zone *zone; - unsigned int free = 0, meta = 0; - - for_each_zone(zone) { - meta += snapshot_additional_pages(zone); - if (!is_highmem(zone)) - free += zone_page_state(zone, NR_FREE_PAGES); - } - - nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, meta, free); - - return free > nr_pages + PAGES_FOR_IO + meta; -} - -#ifdef CONFIG_HIGHMEM -/** - * get_highmem_buffer - if there are some highmem pages in the suspend - * image, we may need the buffer to copy them and/or load their data. - */ - -static inline int get_highmem_buffer(int safe_needed) -{ - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); - return buffer ? 0 : -ENOMEM; -} - -/** - * alloc_highmem_image_pages - allocate some highmem pages for the image. - * Try to allocate as many pages as needed, but if the number of free - * highmem pages is lesser than that, allocate them all. 
- */ - -static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) -{ - unsigned int to_alloc = count_free_highmem_pages(); - - if (to_alloc > nr_highmem) - to_alloc = nr_highmem; - - nr_highmem -= to_alloc; - while (to_alloc-- > 0) { - struct page *page; - - page = alloc_image_page(__GFP_HIGHMEM); - memory_bm_set_bit(bm, page_to_pfn(page)); - } - return nr_highmem; -} -#else -static inline int get_highmem_buffer(int safe_needed) { return 0; } - -static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } -#endif /* CONFIG_HIGHMEM */ - -/** - * swsusp_alloc - allocate memory for the suspend image - * - * We first try to allocate as many highmem pages as there are - * saveable highmem pages in the system. If that fails, we allocate - * non-highmem pages for the copies of the remaining highmem ones. - * - * In this approach it is likely that the copies of highmem pages will - * also be located in the high memory, because of the way in which - * copy_data_pages() works. 
- */ - -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages, unsigned int nr_highmem) -{ - int error; - - error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - - error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - - if (nr_highmem > 0) { - error = get_highmem_buffer(PG_ANY); - if (error) - goto Free; - - nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); - } - while (nr_pages-- > 0) { - struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); - - if (!page) - goto Free; - - memory_bm_set_bit(copy_bm, page_to_pfn(page)); - } - return 0; - - Free: - swsusp_free(); - return -ENOMEM; -} - -/* Memory bitmap used for marking saveable pages (during suspend) or the - * suspend image pages (during resume) - */ -static struct memory_bitmap orig_bm; -/* Memory bitmap used on suspend for marking allocated pages that will contain - * the copies of saveable pages. During resume it is initially used for - * marking the suspend image pages, but then its set bits are duplicated in - * @orig_bm and it is released. Next, on systems with high memory, it may be - * used for marking "safe" highmem pages, but it has to be reinitialized for - * this purpose. - */ -static struct memory_bitmap copy_bm; - -asmlinkage int swsusp_save(void) -{ - unsigned int nr_pages, nr_highmem; - - printk(KERN_INFO "PM: Creating hibernation image: \n"); - - drain_local_pages(NULL); - nr_pages = count_data_pages(); - nr_highmem = count_highmem_pages(); - printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); - - if (!enough_free_mem(nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Not enough free memory\n"); - return -ENOMEM; - } - - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Memory allocation failed\n"); - return -ENOMEM; - } - - /* During allocating of suspend pagedir, new cold pages may appear. 
- * Kill them. - */ - drain_local_pages(NULL); - copy_data_pages(©_bm, &orig_bm); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. - */ - - nr_pages += nr_highmem; - nr_copy_pages = nr_pages; - nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - - printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", - nr_pages); - - return 0; -} - -#ifndef CONFIG_ARCH_HIBERNATION_HEADER -static int init_header_complete(struct swsusp_info *info) -{ - memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); - info->version_code = LINUX_VERSION_CODE; - return 0; -} - -static char *check_image_kernel(struct swsusp_info *info) -{ - if (info->version_code != LINUX_VERSION_CODE) - return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) - return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) - return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) - return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) - return "machine"; - return NULL; -} -#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ - -unsigned long snapshot_get_image_size(void) -{ - return nr_copy_pages + nr_meta_pages + 1; -} - -static int init_header(struct swsusp_info *info) -{ - memset(info, 0, sizeof(struct swsusp_info)); - info->num_physpages = num_physpages; - info->image_pages = nr_copy_pages; - info->pages = snapshot_get_image_size(); - info->size = info->pages; - info->size <<= PAGE_SHIFT; - return init_header_complete(info); -} - -/** - * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - * are stored in the array @buf[] (1 page at a time) - */ - -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - buf[j] = 
memory_bm_next_pfn(bm); - if (unlikely(buf[j] == BM_END_OF_MAP)) - break; - } -} - -/** - * snapshot_read_next - used for reading the system memory snapshot. - * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. - * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * - * On success the function returns a positive number. Then, the caller - * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. - * - * The function returns 0 to indicate the end of data stream condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. - */ - -int snapshot_read_next(struct snapshot_handle *handle, size_t count) -{ - if (handle->cur > nr_meta_pages + nr_copy_pages) - return 0; - - if (!buffer) { - /* This makes the buffer be freed by swsusp_free() */ - buffer = get_image_page(GFP_ATOMIC, PG_ANY); - if (!buffer) - return -ENOMEM; - } - if (!handle->offset) { - int error; - - error = init_header((struct swsusp_info *)buffer); - if (error) - return error; - handle->buffer = buffer; - memory_bm_position_reset(&orig_bm); - memory_bm_position_reset(©_bm); - } - if (handle->prev < handle->cur) { - if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; - - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). 
- */ - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } - } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; - } - handle->offset += count; - return count; -} - -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static int mark_unsafe_pages(struct memory_bitmap *bm) -{ - struct zone *zone; - unsigned long pfn, max_zone_pfn; - - /* Clear page flags */ - for_each_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) - swsusp_unset_page_free(pfn_to_page(pfn)); - } - - /* Mark pages that correspond to the "original" pfns as "unsafe" */ - memory_bm_position_reset(bm); - do { - pfn = memory_bm_next_pfn(bm); - if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) - swsusp_set_page_free(pfn_to_page(pfn)); - else - return -EFAULT; - } - } while (pfn != BM_END_OF_MAP); - - allocated_unsafe_pages = 0; - - return 0; -} - -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) -{ - unsigned long pfn; - - memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); - while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); - } -} - -static int check_header(struct swsusp_info *info) -{ - char *reason; - - reason = check_image_kernel(info); - if (!reason && info->num_physpages != num_physpages) - reason = "memory size"; - if (reason) { - printk(KERN_ERR "PM: Image mismatch: %s\n", reason); - return 
-EPERM; - } - return 0; -} - -/** - * load header - check the image header and copy data from it - */ - -static int -load_header(struct swsusp_info *info) -{ - int error; - - restore_pblist = NULL; - error = check_header(info); - if (!error) { - nr_copy_pages = info->image_pages; - nr_meta_pages = info->pages - info->image_pages - 1; - } - return error; -} - -/** - * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - * the corresponding bit in the memory bitmap @bm - */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - if (unlikely(buf[j] == BM_END_OF_MAP)) - break; - - memory_bm_set_bit(bm, buf[j]); - } -} - -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; - -#ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that - * should be restored atomically during the resume from disk, because the page - * frames they have occupied before the suspend are in use. - */ -struct highmem_pbe { - struct page *copy_page; /* data is here now */ - struct page *orig_page; /* data was here before the suspend */ - struct highmem_pbe *next; -}; - -/* List of highmem PBEs needed for restoring the highmem pages that were - * allocated before the suspend and included in the suspend image, but have - * also been allocated by the "resume" kernel, so their contents cannot be - * written directly to their "original" page frames. - */ -static struct highmem_pbe *highmem_pblist; - -/** - * count_highmem_image_pages - compute the number of highmem pages in the - * suspend image. The bits in the memory bitmap @bm that correspond to the - * image pages are assumed to be set. 
- */ - -static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) -{ - unsigned long pfn; - unsigned int cnt = 0; - - memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm); - while (pfn != BM_END_OF_MAP) { - if (PageHighMem(pfn_to_page(pfn))) - cnt++; - - pfn = memory_bm_next_pfn(bm); - } - return cnt; -} - -/** - * prepare_highmem_image - try to allocate as many highmem pages as - * there are highmem image pages (@nr_highmem_p points to the variable - * containing the number of highmem image pages). The pages that are - * "safe" (ie. will not be overwritten when the suspend image is - * restored) have the corresponding bits set in @bm (it must be - * unitialized). - * - * NOTE: This function should not be called if there are no highmem - * image pages. - */ - -static unsigned int safe_highmem_pages; - -static struct memory_bitmap *safe_highmem_bm; - -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - unsigned int to_alloc; - - if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) - return -ENOMEM; - - if (get_highmem_buffer(PG_SAFE)) - return -ENOMEM; - - to_alloc = count_free_highmem_pages(); - if (to_alloc > *nr_highmem_p) - to_alloc = *nr_highmem_p; - else - *nr_highmem_p = to_alloc; - - safe_highmem_pages = 0; - while (to_alloc-- > 0) { - struct page *page; - - page = alloc_page(__GFP_HIGHMEM); - if (!swsusp_page_is_free(page)) { - /* The page is "safe", set its bit the bitmap */ - memory_bm_set_bit(bm, page_to_pfn(page)); - safe_highmem_pages++; - } - /* Mark the page as allocated */ - swsusp_set_page_forbidden(page); - swsusp_set_page_free(page); - } - memory_bm_position_reset(bm); - safe_highmem_bm = bm; - return 0; -} - -/** - * get_highmem_page_buffer - for given highmem image page find the buffer - * that suspend_write_next() should set for its caller to write to. 
- * - * If the page is to be saved to its "original" page frame or a copy of - * the page is to be made in the highmem, @buffer is returned. Otherwise, - * the copy of the page is to be made in normal memory, so the address of - * the copy is returned. - * - * If @buffer is returned, the caller of suspend_write_next() will write - * the page's contents to @buffer, so they will have to be copied to the - * right location on the next call to suspend_write_next() and it is done - * with the help of copy_last_highmem_page(). For this purpose, if - * @buffer is returned, @last_highmem page is set to the page to which - * the data will have to be copied from @buffer. - */ - -static struct page *last_highmem_page; - -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) -{ - struct highmem_pbe *pbe; - void *kaddr; - - if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { - /* We have allocated the "original" page frame and we can - * use it directly to store the loaded page. - */ - last_highmem_page = page; - return buffer; - } - /* The "original" page frame has not been allocated and we have to - * use a "safe" page frame to store the loaded page. 
- */ - pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); - if (!pbe) { - swsusp_free(); - return NULL; - } - pbe->orig_page = page; - if (safe_highmem_pages > 0) { - struct page *tmp; - - /* Copy of the page will be stored in high memory */ - kaddr = buffer; - tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); - safe_highmem_pages--; - last_highmem_page = tmp; - pbe->copy_page = tmp; - } else { - /* Copy of the page will be stored in normal memory */ - kaddr = safe_pages_list; - safe_pages_list = safe_pages_list->next; - pbe->copy_page = virt_to_page(kaddr); - } - pbe->next = highmem_pblist; - highmem_pblist = pbe; - return kaddr; -} - -/** - * copy_last_highmem_page - copy the contents of a highmem image from - * @buffer, where the caller of snapshot_write_next() has place them, - * to the right location represented by @last_highmem_page . - */ - -static void copy_last_highmem_page(void) -{ - if (last_highmem_page) { - void *dst; - - dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); - kunmap_atomic(dst, KM_USER0); - last_highmem_page = NULL; - } -} - -static inline int last_highmem_page_copied(void) -{ - return !last_highmem_page; -} - -static inline void free_highmem_data(void) -{ - if (safe_highmem_bm) - memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); - - if (buffer) - free_image_page(buffer, PG_UNSAFE_CLEAR); -} -#else -static inline int get_safe_write_buffer(void) { return 0; } - -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } - -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - return 0; -} - -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) -{ - return NULL; -} - -static inline void copy_last_highmem_page(void) {} -static inline int last_highmem_page_copied(void) { return 1; } -static inline void free_highmem_data(void) {} -#endif /* CONFIG_HIGHMEM */ - -/** - * prepare_image - use the 
memory bitmap @bm to mark the pages that will - * be overwritten in the process of restoring the system memory state - * from the suspend image ("unsafe" pages) and allocate memory for the - * image. - * - * The idea is to allocate a new memory bitmap first and then allocate - * as many pages as needed for the image data, but not to assign these - * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a lists of "safe" pages that will be used - * later. On systems with high memory a list of "safe" highmem pages is - * also created. - */ - -#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) -{ - unsigned int nr_pages, nr_highmem; - struct linked_page *sp_list, *lp; - int error; - - /* If there is no highmem, the buffer will not be necessary */ - free_image_page(buffer, PG_UNSAFE_CLEAR); - buffer = NULL; - - nr_highmem = count_highmem_image_pages(bm); - error = mark_unsafe_pages(bm); - if (error) - goto Free; - - error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); - if (error) - goto Free; - - duplicate_memory_bitmap(new_bm, bm); - memory_bm_free(bm, PG_UNSAFE_KEEP); - if (nr_highmem > 0) { - error = prepare_highmem_image(bm, &nr_highmem); - if (error) - goto Free; - } - /* Reserve some safe pages for potential later use. - * - * NOTE: This way we make sure there will be enough safe pages for the - * chain_alloc() in get_buffer(). It is a bit wasteful, but - * nr_copy_pages cannot be greater than 50% of the memory anyway. 
- */ - sp_list = NULL; - /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; - nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); - while (nr_pages > 0) { - lp = get_image_page(GFP_ATOMIC, PG_SAFE); - if (!lp) { - error = -ENOMEM; - goto Free; - } - lp->next = sp_list; - sp_list = lp; - nr_pages--; - } - /* Preallocate memory for the image */ - safe_pages_list = NULL; - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; - while (nr_pages > 0) { - lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); - if (!lp) { - error = -ENOMEM; - goto Free; - } - if (!swsusp_page_is_free(virt_to_page(lp))) { - /* The page is "safe", add it to the list */ - lp->next = safe_pages_list; - safe_pages_list = lp; - } - /* Mark the page as allocated */ - swsusp_set_page_forbidden(virt_to_page(lp)); - swsusp_set_page_free(virt_to_page(lp)); - nr_pages--; - } - /* Free the reserved safe pages so that chain_alloc() can use them */ - while (sp_list) { - lp = sp_list->next; - free_image_page(sp_list, PG_UNSAFE_CLEAR); - sp_list = lp; - } - return 0; - - Free: - swsusp_free(); - return error; -} - -/** - * get_buffer - compute the address that snapshot_write_next() should - * set for its caller to write to. - */ - -static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) -{ - struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); - - if (PageHighMem(page)) - return get_highmem_page_buffer(page, ca); - - if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) - /* We have allocated the "original" page frame and we can - * use it directly to store the loaded page. - */ - return page_address(page); - - /* The "original" page frame has not been allocated and we have to - * use a "safe" page frame to store the loaded page. 
- */ - pbe = chain_alloc(ca, sizeof(struct pbe)); - if (!pbe) { - swsusp_free(); - return NULL; - } - pbe->orig_address = page_address(page); - pbe->address = safe_pages_list; - safe_pages_list = safe_pages_list->next; - pbe->next = restore_pblist; - restore_pblist = pbe; - return pbe->address; -} - -/** - * snapshot_write_next - used for writing the system memory snapshot. - * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. - * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * - * On success the function returns a positive number. Then, the caller - * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. - * - * The function returns 0 to indicate the "end of file" condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. 
- */ - -int snapshot_write_next(struct snapshot_handle *handle, size_t count) -{ - static struct chain_allocator ca; - int error = 0; - - /* Check if we have already loaded the entire image */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) - return 0; - - if (handle->offset == 0) { - if (!buffer) - /* This makes the buffer be freed by swsusp_free() */ - buffer = get_image_page(GFP_ATOMIC, PG_ANY); - - if (!buffer) - return -ENOMEM; - - handle->buffer = buffer; - } - handle->sync_read = 1; - if (handle->prev < handle->cur) { - if (handle->prev == 0) { - error = load_header(buffer); - if (error) - return error; - - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); - if (error) - return error; - - } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); - if (handle->prev == nr_meta_pages) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; - } - } else { - copy_last_highmem_page(); - handle->buffer = get_buffer(&orig_bm, &ca); - if (handle->buffer != buffer) - handle->sync_read = 0; - } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; - } - handle->offset += count; - return count; -} - -/** - * snapshot_write_finalize - must be called after the last call to - * snapshot_write_next() in case the last page in the image happens - * to be a highmem page and its contents should be stored in the - * highmem. Additionally, it releases the memory that will not be - * used any more. 
- */ - -void snapshot_write_finalize(struct snapshot_handle *handle) -{ - copy_last_highmem_page(); - /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); - free_highmem_data(); - } -} - -int snapshot_image_loaded(struct snapshot_handle *handle) -{ - return !(!nr_copy_pages || !last_highmem_page_copied() || - handle->cur <= nr_meta_pages + nr_copy_pages); -} - -#ifdef CONFIG_HIGHMEM -/* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) -{ - void *kaddr1, *kaddr2; - - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); - memcpy(buf, kaddr1, PAGE_SIZE); - memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); - kunmap_atomic(kaddr2, KM_USER1); -} - -/** - * restore_highmem - for each highmem page that was allocated before - * the suspend and included in the suspend image, and also has been - * allocated by the "resume" kernel swap its current (ie. "before - * resume") contents with the previous (ie. "before suspend") one. - * - * If the resume eventually fails, we can call this function once - * again and restore the "before resume" highmem state. - */ - -int restore_highmem(void) -{ - struct highmem_pbe *pbe = highmem_pblist; - void *buf; - - if (!pbe) - return 0; - - buf = get_image_page(GFP_ATOMIC, PG_SAFE); - if (!buf) - return -ENOMEM; - - while (pbe) { - swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); - pbe = pbe->next; - } - free_image_page(buf, PG_UNSAFE_CLEAR); - return 0; -} -#endif /* CONFIG_HIGHMEM */ -/* - * linux/kernel/power/swap.c - * - * This file provides functions for reading the suspend image from - * and writing it to a swap partition. - * - * Copyright (C) 1998,2001-2005 Pavel Machek - * Copyright (C) 2006 Rafael J. 
Wysocki - * - * This file is released under the GPLv2. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - -#define SWSUSP_SIG "S1SUSPEND" - -struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; - sector_t image; - unsigned int flags; /* Flags to pass to the "boot" kernel */ - char orig_sig[10]; - char sig[10]; -} __attribute__((packed)); - -static struct swsusp_header *swsusp_header; - -/* - * General things - */ - -static unsigned short root_swap = 0xffff; -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. 
- */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, page_off, virt_to_page(addr), bio_chain); -} - -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} - -/* - * Saving part - */ - -static int mark_swapfiles(sector_t start, unsigned int flags) -{ - int error; - - bio_read_page(swsusp_resume_block, swsusp_header, NULL); - if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { - 
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); - memcpy(swsusp_header->sig,SWSUSP_SIG, 10); - swsusp_header->image = start; - swsusp_header->flags = flags; - error = bio_write_page(swsusp_resume_block, - swsusp_header, NULL); - } else { - printk(KERN_ERR "PM: Swap header not found!\n"); - error = -ENODEV; - } - return error; -} - -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - */ - -static int swsusp_swap_check(void) /* This is called before saving image */ -{ - int res; - - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &resume_bdev); - if (res < 0) - return res; - - root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); - if (res) - return res; - - res = set_blocksize(resume_bdev, PAGE_SIZE); - if (res < 0) - blkdev_put(resume_bdev); - - return res; -} - -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @bio_chain: Link the next write BIO here - */ - -static int write_page(void *buf, sector_t offset, struct bio **bio_chain) -{ - void *src; - - if (!offset) - return -ENOSPC; - - if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (src) { - memcpy(src, buf, PAGE_SIZE); - } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; - } - } else { - src = buf; - } - return bio_write_page(offset, src, bio_chain); -} - -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. 
- * - * During resume we also only need to use one swap_map_page structure - * at a time. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - sector_t cur_swap; - unsigned int k; -}; - -static void release_swap_writer(struct swap_map_handle *handle) -{ - if (handle->cur) - free_page((unsigned long)handle->cur); - handle->cur = NULL; -} - -static int get_swap_writer(struct swap_map_handle *handle) -{ - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; - handle->cur_swap = alloc_swapdev_block(root_swap); - if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; - } - handle->k = 0; - return 0; -} - -static int swap_write_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) -{ - int error = 0; - sector_t offset; - - if (!handle->cur) - return -EINVAL; - offset = alloc_swapdev_block(root_swap); - error = write_page(buf, offset, bio_chain); - if (error) - return error; - handle->cur->entries[handle->k++] = offset; - if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); - if (error) - goto out; - offset = alloc_swapdev_block(root_swap); - if (!offset) - return -ENOSPC; - handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, NULL); - if (error) - goto out; - memset(handle->cur, 0, PAGE_SIZE); - handle->cur_swap = offset; - handle->k = 0; - } - out: - return error; -} - -static int flush_swap_writer(struct swap_map_handle *handle) -{ - if (handle->cur && handle->cur_swap) - return write_page(handle->cur, handle->cur_swap, NULL); - else - return -EINVAL; -} - -/** - * save_image - save the suspend image data - */ - -static int save_image(struct swap_map_handle *handle, - 
struct snapshot_handle *snapshot, - unsigned int nr_to_write) -{ - unsigned int m; - int ret; - int error = 0; - int nr_pages; - int err2; - struct bio *bio; - struct timeval start; - struct timeval stop; - - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", - nr_to_write); - m = nr_to_write / 100; - if (!m) - m = 1; - nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); - do { - ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); - err2 = wait_on_bio_chain(&bio); - do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) - printk("\b\b\b\bdone\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; -} - -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable from the resume partition. - */ - -static int enough_swap(unsigned int nr_pages) -{ - unsigned int free_swap = count_swap_pages(root_swap, 1); - - pr_debug("PM: Free swap pages: %u\n", free_swap); - return free_swap > nr_pages + PAGES_FOR_IO; -} - -/** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header - * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) 
- */ - -int swsusp_write(unsigned int flags) -{ - struct swap_map_handle handle; - struct snapshot_handle snapshot; - struct swsusp_info *header; - int error; - - error = swsusp_swap_check(); - if (error) { - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); - return error; - } - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); - if (error < PAGE_SIZE) { - if (error >= 0) - error = -EFAULT; - - goto out; - } - header = (struct swsusp_info *)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out; - } - error = get_swap_writer(&handle); - if (!error) { - sector_t start = handle.cur_swap; - - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - - if (!error) { - flush_swap_writer(&handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(start, flags); - printk("|\n"); - } - } - if (error) - free_all_swap_pages(root_swap); - - release_swap_writer(&handle); - out: - swsusp_close(); - return error; -} - -/** - * The following functions allow us to read data using a swap map - * in a file-alike way - */ - -static void release_swap_reader(struct swap_map_handle *handle) -{ - if (handle->cur) - free_page((unsigned long)handle->cur); - handle->cur = NULL; -} - -static int get_swap_reader(struct swap_map_handle *handle, sector_t start) -{ - int error; - - if (!start) - return -EINVAL; - - handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); - if (!handle->cur) - return -ENOMEM; - - error = bio_read_page(start, handle->cur, NULL); - if (error) { - release_swap_reader(handle); - return error; - } - handle->k = 0; - return 0; -} - -static int swap_read_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) -{ - sector_t offset; - int error; - - if (!handle->cur) - return -EINVAL; - offset = 
handle->cur->entries[handle->k]; - if (!offset) - return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); - if (error) - return error; - if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); - handle->k = 0; - offset = handle->cur->next_swap; - if (!offset) - release_swap_reader(handle); - else if (!error) - error = bio_read_page(offset, handle->cur, NULL); - } - return error; -} - -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - -static int load_image(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_read) -{ - unsigned int m; - int error = 0; - struct timeval start; - struct timeval stop; - struct bio *bio; - int err2; - unsigned nr_pages; - - printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", - nr_to_read); - m = nr_to_read / 100; - if (!m) - m = 1; - nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); - for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); - if (error <= 0) - break; - error = swap_read_page(handle, data_of(*snapshot), &bio); - if (error) - break; - if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - err2 = wait_on_bio_chain(&bio); - do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) { - printk("\b\b\b\bdone\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; - } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - return error; -} - -/** - * swsusp_read - read the hibernation image. 
- * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memeory location - */ - -int swsusp_read(unsigned int *flags_p) -{ - int error; - struct swap_map_handle handle; - struct snapshot_handle snapshot; - struct swsusp_info *header; - - *flags_p = swsusp_header->flags; - if (IS_ERR(resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return PTR_ERR(resume_bdev); - } - - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); - if (error < PAGE_SIZE) - return error < 0 ? error : -EFAULT; - header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header->image); - if (!error) - error = swap_read_page(&handle, header, NULL); - if (!error) - error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - - blkdev_put(resume_bdev); - - if (!error) - pr_debug("PM: Image successfully loaded\n"); - else - pr_debug("PM: Error %d resuming\n", error); - return error; -} - -/** - * swsusp_check - Check for swsusp signature in the resume device - */ - -int swsusp_check(void) -{ - int error; - - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); - error = bio_read_page(swsusp_resume_block, - swsusp_header, NULL); - if (error) - return error; - - if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { - memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); - /* Reset swap signature now */ - error = bio_write_page(swsusp_resume_block, - swsusp_header, NULL); - } else { - return -EINVAL; - } - if (error) - blkdev_put(resume_bdev); - else - pr_debug("PM: Signature found, resuming\n"); - } else { - error = PTR_ERR(resume_bdev); - } - - if (error) - pr_debug("PM: Error %d checking image file\n", error); - - return error; -} - -/** - * swsusp_close - close swap device. 
- */ - -void swsusp_close(void) -{ - if (IS_ERR(resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return; - } - - blkdev_put(resume_bdev); -} - -static int swsusp_header_init(void) -{ - swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); - if (!swsusp_header) - panic("Could not allocate memory for swsusp_header\n"); - return 0; -} - -core_initcall(swsusp_header_init); -/* - * linux/kernel/power/swsusp.c - * - * This file provides code to write suspend image to swap and read it back. - * - * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001-2005 Pavel Machek - * - * This file is released under the GPLv2. - * - * I'd like to thank the following people for their work: - * - * Pavel Machek : - * Modifications, defectiveness pointing, being with me at the very beginning, - * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. - * - * Steve Doddi : - * Support the possibility of hardware state restoring. - * - * Raph : - * Support for preserving states of network devices and virtual console - * (including X and svgatextmode) - * - * Kurt Garloff : - * Straightened the critical function in order to prevent compilers from - * playing tricks with local variables. - * - * Andreas Mohr - * - * Alex Badea : - * Fixed runaway init - * - * Rafael J. Wysocki - * Reworked the freeing of memory and the handling of swap - * - * More state savers are welcome. Especially for the scsi layer... - * - * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. 
- */ -unsigned long image_size = 500 * 1024 * 1024; - -int in_suspend __nosavedata = 0; - -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. - */ - -struct swsusp_extent { - struct rb_node node; - unsigned long start; - unsigned long end; -}; - -static struct rb_root swsusp_extents = RB_ROOT; - -static int swsusp_extents_insert(unsigned long swap_offset) -{ - struct rb_node **new = &(swsusp_extents.rb_node); - struct rb_node *parent = NULL; - struct swsusp_extent *ext; - - /* Figure out where to put the new node */ - while (*new) { - ext = container_of(*new, struct swsusp_extent, node); - parent = *new; - if (swap_offset < ext->start) { - /* Try to merge */ - if (swap_offset == ext->start - 1) { - ext->start--; - return 0; - } - new = &((*new)->rb_left); - } else if (swap_offset > ext->end) { - /* Try to merge */ - if (swap_offset == ext->end + 1) { - ext->end++; - return 0; - } - new = &((*new)->rb_right); - } else { - /* It already is in the tree */ - return -EINVAL; - } - } - /* Add the new node and rebalance the tree. */ - ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); - if (!ext) - return -ENOMEM; - - ext->start = swap_offset; - ext->end = swap_offset; - rb_link_node(&ext->node, parent, new); - rb_insert_color(&ext->node, &swsusp_extents); - return 0; -} - -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - -sector_t alloc_swapdev_block(int swap) -{ - unsigned long offset; - - offset = swp_offset(get_swap_page_of_type(swap)); - if (offset) { - if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); - else - return swapdev_block(swap, offset); - } - return 0; -} - -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been - * allocated. 
- */ - -void free_all_swap_pages(int swap) -{ - struct rb_node *node; - - while ((node = swsusp_extents.rb_node)) { - struct swsusp_extent *ext; - unsigned long offset; - - ext = container_of(node, struct swsusp_extent, node); - rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); - - kfree(ext); - } -} - -int swsusp_swap_in_use(void) -{ - return (swsusp_extents.rb_node != NULL); -} - -/** - * swsusp_show_speed - print the time elapsed between two events represented by - * @start and @stop - * - * @nr_pages - number of pages processed between @start and @stop - * @msg - introductory message to print - */ - -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} - -/** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. - */ - -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) -{ - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); -} - -int swsusp_shrink_memory(void) -{ - long tmp; - struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; - struct timeval start, stop; - - printk(KERN_INFO "PM: Shrinking memory... 
"); - do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_zone (zone) - if (populated_zone(zone)) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - - if (highmem_size < 0) - highmem_size = 0; - - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); - do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); - - return 0; -} -/* - * linux/kernel/power/user.c - * - * This file provides the user space interface for software suspend/resume. - * - * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "power.h" - -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future. They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE 1 -#define PMOPS_ENTER 2 -#define PMOPS_FINISH 3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones. 
They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) - - -#define SNAPSHOT_MINOR 231 - -static struct snapshot_data { - struct snapshot_handle handle; - int swap; - int mode; - char frozen; - char ready; - char platform_support; -} snapshot_state; - -atomic_t snapshot_device_available = ATOMIC_INIT(1); - -static int snapshot_open(struct inode *inode, struct file *filp) -{ - struct snapshot_data *data; - int error; - - mutex_lock(&pm_mutex); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); - error = -ENOSYS; - goto Unlock; - } - if(create_basic_memory_bitmaps()) { - atomic_inc(&snapshot_device_available); - error = -ENOMEM; - goto Unlock; - } - nonseekable_open(inode, filp); - data = &snapshot_state; - filp->private_data = data; - memset(&data->handle, 0, sizeof(struct snapshot_handle)); - if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { - data->swap = swsusp_resume_device ? 
- swap_type_of(swsusp_resume_device, 0, NULL) : -1; - data->mode = O_RDONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - pm_notifier_call_chain(PM_POST_RESTORE); - } else { - data->swap = -1; - data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); - } - if (error) - atomic_inc(&snapshot_device_available); - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; - - Unlock: - mutex_unlock(&pm_mutex); - - return error; -} - -static int snapshot_release(struct inode *inode, struct file *filp) -{ - struct snapshot_data *data; - - mutex_lock(&pm_mutex); - - swsusp_free(); - free_basic_memory_bitmaps(); - data = filp->private_data; - free_all_swap_pages(data->swap); - if (data->frozen) - thaw_processes(); - pm_notifier_call_chain(data->mode == O_RDONLY ? - PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); - - mutex_unlock(&pm_mutex); - - return 0; -} - -static ssize_t snapshot_read(struct file *filp, char __user *buf, - size_t count, loff_t *offp) -{ - struct snapshot_data *data; - ssize_t res; - - mutex_lock(&pm_mutex); - - data = filp->private_data; - if (!data->ready) { - res = -ENODATA; - goto Unlock; - } - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; - } - - Unlock: - mutex_unlock(&pm_mutex); - - return res; -} - -static ssize_t snapshot_write(struct file *filp, const char __user *buf, - size_t count, loff_t *offp) -{ - struct snapshot_data *data; - ssize_t res; - - mutex_lock(&pm_mutex); - - data = filp->private_data; - res = snapshot_write_next(&data->handle, count); - if (res > 0) { - if (copy_from_user(data_of(data->handle), buf, res)) - res = -EFAULT; - else - *offp = data->handle.offset; - } - - mutex_unlock(&pm_mutex); - - return res; -} - -static long snapshot_ioctl(struct file 
*filp, unsigned int cmd, - unsigned long arg) -{ - int error = 0; - struct snapshot_data *data; - loff_t size; - sector_t offset; - - if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) - return -ENOTTY; - if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) - return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; - - data = filp->private_data; - - switch (cmd) { - - case SNAPSHOT_FREEZE: - if (data->frozen) - break; - printk("Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - error = freeze_processes(); - if (error) - thaw_processes(); - if (!error) - data->frozen = 1; - break; - - case SNAPSHOT_UNFREEZE: - if (!data->frozen || data->ready) - break; - thaw_processes(); - data->frozen = 0; - break; - - case SNAPSHOT_CREATE_IMAGE: - case SNAPSHOT_ATOMIC_SNAPSHOT: - if (data->mode != O_RDONLY || !data->frozen || data->ready) { - error = -EPERM; - break; - } - error = hibernation_snapshot(data->platform_support); - if (!error) - error = put_user(in_suspend, (int __user *)arg); - if (!error) - data->ready = 1; - break; - - case SNAPSHOT_ATOMIC_RESTORE: - snapshot_write_finalize(&data->handle); - if (data->mode != O_WRONLY || !data->frozen || - !snapshot_image_loaded(&data->handle)) { - error = -EPERM; - break; - } - error = hibernation_restore(data->platform_support); - break; - - case SNAPSHOT_FREE: - swsusp_free(); - memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; - break; - - case SNAPSHOT_PREF_IMAGE_SIZE: - case SNAPSHOT_SET_IMAGE_SIZE: - image_size = arg; - break; - - case SNAPSHOT_GET_IMAGE_SIZE: - if (!data->ready) { - error = -ENODATA; - break; - } - size = snapshot_get_image_size(); - size <<= PAGE_SHIFT; - error = put_user(size, (loff_t __user *)arg); - break; - - case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_AVAIL_SWAP: - size = count_swap_pages(data->swap, 1); - size <<= PAGE_SHIFT; - error = put_user(size, (loff_t __user *)arg); - break; - - case SNAPSHOT_ALLOC_SWAP_PAGE: - 
case SNAPSHOT_GET_SWAP_PAGE: - if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { - error = -ENODEV; - break; - } - offset = alloc_swapdev_block(data->swap); - if (offset) { - offset <<= PAGE_SHIFT; - error = put_user(offset, (loff_t __user *)arg); - } else { - error = -ENOSPC; - } - break; - - case SNAPSHOT_FREE_SWAP_PAGES: - if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { - error = -ENODEV; - break; - } - free_all_swap_pages(data->swap); - break; - - case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ - if (!swsusp_swap_in_use()) { - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), - 0, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } else { - error = -EPERM; - } - break; - - case SNAPSHOT_S2RAM: - if (!data->frozen) { - error = -EPERM; - break; - } - /* - * Tasks are frozen and the notifiers have been called with - * PM_HIBERNATION_PREPARE - */ - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - break; - - case SNAPSHOT_PLATFORM_SUPPORT: - data->platform_support = !!arg; - break; - - case SNAPSHOT_POWER_OFF: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ - error = -EINVAL; - - switch (arg) { - - case PMOPS_PREPARE: - data->platform_support = 1; - error = 0; - break; - - case PMOPS_ENTER: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case PMOPS_FINISH: - if (data->platform_support) - error = 0; - break; - - default: - printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - - } - break; - - case SNAPSHOT_SET_SWAP_AREA: - if (swsusp_swap_in_use()) { - error = -EPERM; - } else { - struct resume_swap_area swap_area; - dev_t swdev; - - error = copy_from_user(&swap_area, (void __user *)arg, - sizeof(struct resume_swap_area)); - if (error) { - 
error = -EFAULT; - break; - } - - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - swdev = old_decode_dev(swap_area.dev); - if (swdev) { - offset = swap_area.offset; - data->swap = swap_type_of(swdev, offset, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } - break; - - default: - error = -ENOTTY; - - } - - mutex_unlock(&pm_mutex); - - return error; -} - -static const struct file_operations snapshot_fops = { - .open = snapshot_open, - .release = snapshot_release, - .read = snapshot_read, - .write = snapshot_write, - .llseek = no_llseek, - .unlocked_ioctl = snapshot_ioctl, -}; - -static struct miscdevice snapshot_device = { - .minor = SNAPSHOT_MINOR, - .name = "snapshot", - .fops = &snapshot_fops, -}; - -static int __init snapshot_device_init(void) -{ - return misc_register(&snapshot_device); -}; - -device_initcall(snapshot_device_init); -/* - * linux/kernel/printk.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Modified to make sys_syslog() more flexible: added commands to - * return the last 4k of kernel messages, regardless of whether - * they've been read or not. Added option to suppress kernel printk's - * to the console. Added hook for sending the console messages - * elsewhere, in preparation for a serial line console (someday). - * Ted Ts'o, 2/11/93. - * Modified for sysctl support, 1/8/97, Chris Horn. - * Fixed SMP synchronization, 08/08/99, Manfred Spraul - * manfred@colorfullife.com - * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For in_interrupt() */ -#include -#include -#include -#include -#include - -#include - -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 
-{ -} - -#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) - -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ - -/* We show everything that is MORE important than this.. */ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - -DECLARE_WAIT_QUEUE_HEAD(log_wait); - -int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ -}; - -/* - * Low level drivers may need that to know if they can schedule in - * their unblank() callback or not. So let's export it. - */ -int oops_in_progress; -EXPORT_SYMBOL(oops_in_progress); - -/* - * console_sem protects the console_drivers list, and also - * provides serialisation for access to the entire console - * driver system. - */ -static DECLARE_MUTEX(console_sem); -static DECLARE_MUTEX(secondary_console_sem); -struct console *console_drivers; -EXPORT_SYMBOL_GPL(console_drivers); - -/* - * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. It's - * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held - */ -static int console_locked, console_suspended; - -/* - * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars - * It is also used in interesting ways to provide interlocking in - * release_console_sem(). 
- */ -static DEFINE_SPINLOCK(logbuf_lock); - -#define LOG_BUF_MASK (log_buf_len-1) -#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) - -/* - * The indices into log_buf are not constrained to log_buf_len - they - * must be masked before subscripting - */ -static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ -static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ -static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ - -/* - * Array of consoles built from command line options (console=) - */ -struct console_cmdline -{ - char name[8]; /* Name of the driver */ - int index; /* Minor dev. to use */ - char *options; /* Options for the driver */ -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - char *brl_options; /* Options for braille driver */ -#endif -}; - -#define MAX_CMDLINECONSOLES 8 - -static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int selected_console = -1; -static int preferred_console = -1; -int console_set_on_cmdline; -EXPORT_SYMBOL(console_set_on_cmdline); - -/* Flag: console code may call schedule() */ -static int console_may_schedule; - -#ifdef CONFIG_PRINTK - -static char __log_buf[__LOG_BUF_LEN]; -static char *log_buf = __log_buf; -static int log_buf_len = __LOG_BUF_LEN; -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ - -static int __init log_buf_len_setup(char *str) -{ - unsigned size = memparse(str, &str); - unsigned long flags; - - if (size) - size = roundup_pow_of_two(size); - if (size > log_buf_len) { - unsigned start, dest_idx, offset; - char *new_log_buf; - - new_log_buf = alloc_bootmem(size); - if (!new_log_buf) { - printk(KERN_WARNING "log_buf_len: allocation failed\n"); - goto out; - } - - spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = size; - log_buf = new_log_buf; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - log_buf[dest_idx] = 
__log_buf[start & (__LOG_BUF_LEN - 1)]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); - - printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); - } -out: - return 1; -} - -__setup("log_buf_len=", log_buf_len_setup); - -#ifdef CONFIG_BOOT_PRINTK_DELAY - -static unsigned int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ - -static int __init boot_delay_setup(char *str) -{ - unsigned long lpj; - unsigned long long loops_per_msec; - - lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ - loops_per_msec = (unsigned long long)lpj / 1000 * HZ; - - get_option(&str, &boot_delay); - if (boot_delay > 10 * 1000) - boot_delay = 0; - - printk_delay_msec = loops_per_msec; - printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, printk_delay_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, printk_delay_msec); - return 1; -} -__setup("boot_delay=", boot_delay_setup); - -static void boot_delay_msec(void) -{ - unsigned long long k; - unsigned long timeout; - - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) - return; - - k = (unsigned long long)printk_delay_msec * boot_delay; - - timeout = jiffies + msecs_to_jiffies(boot_delay); - while (k) { - k--; - cpu_relax(); - /* - * use (volatile) jiffies to prevent - * compiler reduction; loop termination via jiffies - * is secondary and may or may not happen. - */ - if (time_after(jiffies, timeout)) - break; - touch_nmi_watchdog(); - } -} -#else -static inline void boot_delay_msec(void) -{ -} -#endif - -/* - * Return the number of unread characters in the log buffer. - */ -static int log_buf_get_len(void) -{ - return logged_chars; -} - -/* - * Copy a range of characters from the log buffer. 
- */ -int log_buf_copy(char *dest, int idx, int len) -{ - int ret, max; - bool took_lock = false; - - if (!oops_in_progress) { - spin_lock_irq(&logbuf_lock); - took_lock = true; - } - - max = log_buf_get_len(); - if (idx < 0 || idx >= max) { - ret = -1; - } else { - if (len > max) - len = max; - ret = len; - idx += (log_end - max); - while (len-- > 0) - dest[len] = LOG_BUF(idx + len); - } - - if (took_lock) - spin_unlock_irq(&logbuf_lock); - - return ret; -} - -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. - * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ -int do_syslog(int type, char __user *buf, int len) -{ - unsigned i, j, limit, count; - int do_clear = 0; - char c; - int error = 0; - - error = security_syslog(type); - if (error) - return error; - - switch (type) { - case 0: /* Close log */ - break; - case 1: /* Open log */ - break; - case 2: /* Read from log */ - error = -EINVAL; - if (!buf || len < 0) - goto out; - error = 0; - if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } - error = wait_event_interruptible(log_wait, - (log_start - log_end)); - if (error) - goto out; - i = 0; - spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; - spin_unlock_irq(&logbuf_lock); - error = __put_user(c,buf); - buf++; - i++; - cond_resched(); - spin_lock_irq(&logbuf_lock); - } - spin_unlock_irq(&logbuf_lock); - if (!error) - error = i; - break; - case 4: /* Read/clear last kernel messages */ - do_clear = 1; - /* FALL THRU */ - case 
3: /* Read last kernel messages */ - error = -EINVAL; - if (!buf || len < 0) - goto out; - error = 0; - if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } - count = len; - if (count > log_buf_len) - count = log_buf_len; - spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; - if (do_clear) - logged_chars = 0; - limit = log_end; - /* - * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages - * we try to copy to user space. Therefore - * the messages are copied in reverse. - */ - for (i = 0; i < count && !error; i++) { - j = limit-1-i; - if (j + log_buf_len < log_end) - break; - c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); - error = __put_user(c,&buf[count-1-i]); - cond_resched(); - spin_lock_irq(&logbuf_lock); - } - spin_unlock_irq(&logbuf_lock); - if (error) - break; - error = i; - if (i != count) { - int offset = count-error; - /* buffer overflow during copy, correct user buffer. 
*/ - for (i = 0; i < error; i++) { - if (__get_user(c,&buf[i+offset]) || - __put_user(c,&buf[i])) { - error = -EFAULT; - break; - } - cond_resched(); - } - } - break; - case 5: /* Clear ring buffer */ - logged_chars = 0; - break; - case 6: /* Disable logging to console */ - console_loglevel = minimum_console_loglevel; - break; - case 7: /* Enable logging to console */ - console_loglevel = default_console_loglevel; - break; - case 8: /* Set level of messages printed to console */ - error = -EINVAL; - if (len < 1 || len > 8) - goto out; - if (len < minimum_console_loglevel) - len = minimum_console_loglevel; - console_loglevel = len; - error = 0; - break; - case 9: /* Number of chars in the log buffer */ - error = log_end - log_start; - break; - case 10: /* Size of the log buffer */ - error = log_buf_len; - break; - default: - error = -EINVAL; - break; - } -out: - return error; -} - -SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) -{ - return do_syslog(type, buf, len); -} - -/* - * Call the console drivers on a range of log_buf - */ -static void __call_console_drivers(unsigned start, unsigned end) -{ - struct console *con; - - for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) - con->write(con, &LOG_BUF(start), end - start); - } -} - -static int __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); - -/* - * Write out chars from start to end - 1 inclusive - */ -static void _call_console_drivers(unsigned start, - unsigned end, int msg_log_level) -{ - if ((msg_log_level < console_loglevel || ignore_loglevel) && - console_drivers && start != end) { - if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { - /* wrapped write */ - 
__call_console_drivers(start & LOG_BUF_MASK, - log_buf_len); - __call_console_drivers(0, end & LOG_BUF_MASK); - } else { - __call_console_drivers(start, end); - } - } -} - -/* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_sem must be held. - */ -static void call_console_drivers(unsigned start, unsigned end) -{ - unsigned cur_index, start_print; - static int msg_level = -1; - - BUG_ON(((int)(start - end)) > 0); - - cur_index = start; - start_print = start; - while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') { - msg_level = LOG_BUF(cur_index + 1) - '0'; - cur_index += 3; - start_print = cur_index; - } - while (cur_index != end) { - char c = LOG_BUF(cur_index); - - cur_index++; - if (c == '\n') { - if (msg_level < 0) { - /* - * printk() has already given us loglevel tags in - * the buffer. This code is here in case the - * log buffer has wrapped right round and scribbled - * on those tags - */ - msg_level = default_message_loglevel; - } - _call_console_drivers(start_print, cur_index, msg_level); - msg_level = -1; - start_print = cur_index; - break; - } - } - } - _call_console_drivers(start_print, end, msg_level); -} - -static void emit_log_char(char c) -{ - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; -} - -/* - * Zap console related locks when oopsing. Only zap at most once - * every 10 seconds, to leave time for slow consoles to print a - * full oops. 
- */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - init_MUTEX(&console_sem); -} - -#if defined(CONFIG_PRINTK_TIME) -static int printk_time = 1; -#else -static int printk_time = 0; -#endif -module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); - -/* Check if we have any console registered that can be called early in boot. */ -static int have_callable_console(void) -{ - struct console *con; - - for (con = console_drivers; con; con = con->next) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/** - * printk - print a kernel message - * @fmt: format string - * - * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. - * - * We try to grab the console_sem. If we succeed, it's easy - we log the output and - * call the console drivers. If we fail to get the semaphore we place the output - * into the log buffer and return. The current holder of the console_sem will - * notice the new output in release_console_sem() and will send it to the - * consoles before releasing the semaphore. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - */ - -asmlinkage int printk(const char *fmt, ...) 
-{ - va_list args; - int r; - - va_start(args, fmt); - r = vprintk(fmt, args); - va_end(args); - - return r; -} - -/* cpu currently holding logbuf_lock */ -static volatile unsigned int printk_cpu = UINT_MAX; - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_semaphore held, and 'console_locked' set) if it - * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. - */ -static int acquire_console_semaphore_for_printk(unsigned int cpu) -{ - int retval = 0; - - if (!try_acquire_console_sem()) { - retval = 1; - - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. 
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - up(&console_sem); - retval = 0; - } - } - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); - return retval; -} -static const char recursion_bug_msg [] = - KERN_CRIT "BUG: recent printk recursion!\n"; -static int recursion_bug; - static int new_text_line = 1; -static char printk_buf[1024]; - -asmlinkage int vprintk(const char *fmt, va_list args) -{ - int printed_len = 0; - int current_log_level = default_message_loglevel; - unsigned long flags; - int this_cpu; - char *p; - - boot_delay_msec(); - - preempt_disable(); - /* This stops the holder of console_sem just where we want him */ - raw_local_irq_save(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! - */ - if (unlikely(printk_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress) { - recursion_bug = 1; - goto out_restore_irqs; - } - zap_locks(); - } - - lockdep_off(); - spin_lock(&logbuf_lock); - printk_cpu = this_cpu; - - if (recursion_bug) { - recursion_bug = 0; - strcpy(printk_buf, recursion_bug_msg); - printed_len = sizeof(recursion_bug_msg); - } - /* Emit the output into the temporary buffer */ - printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf) - printed_len, fmt, args); - - - /* - * Copy the output into log_buf. 
If the caller didn't provide - * appropriate log level tags, we insert them here - */ - for (p = printk_buf; *p; p++) { - if (new_text_line) { - /* If a token, set current_log_level and skip over */ - if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' && - p[2] == '>') { - current_log_level = p[1] - '0'; - p += 3; - printed_len -= 3; - } - - /* Always output the token */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; - new_text_line = 0; - - if (printk_time) { - /* Follow the token with the time */ - char tbuf[50], *tp; - unsigned tlen; - unsigned long long t; - unsigned long nanosec_rem; - - t = cpu_clock(printk_cpu); - nanosec_rem = do_div(t, 1000000000); - tlen = sprintf(tbuf, "[%5lu.%06lu] ", - (unsigned long) t, - nanosec_rem / 1000); - - for (tp = tbuf; tp < tbuf + tlen; tp++) - emit_log_char(*tp); - printed_len += tlen; - } - - if (!*p) - break; - } - - emit_log_char(*p); - if (*p == '\n') - new_text_line = 1; - } - - /* - * Try to acquire and then immediately release the - * console semaphore. The release will do all the - * actual magic (print out buffers, wake up klogd, - * etc). - * - * The acquire_console_semaphore_for_printk() function - * will release 'logbuf_lock' regardless of whether it - * actually gets the semaphore or not. - */ - if (acquire_console_semaphore_for_printk(this_cpu)) - release_console_sem(); - - lockdep_on(); -out_restore_irqs: - raw_local_irq_restore(flags); - - preempt_enable(); - return printed_len; -} -EXPORT_SYMBOL(printk); -EXPORT_SYMBOL(vprintk); - -#else - -static void call_console_drivers(unsigned start, unsigned end) -{ -} - -#endif - -static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. 
- */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - if (!brl_options) - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - if (!brl_options) - selected_console = i; - c = &console_cmdline[i]; - strlcpy(c->name, name, sizeof(c->name)); - c->options = options; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - c->brl_options = brl_options; -#endif - c->index = idx; - return 0; -} -/* - * Set up a list of consoles. Called from init/main.c - */ -static int __init console_setup(char *str) -{ - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ - char *s, *options, *brl_options = NULL; - int idx; - -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (!memcmp(str, "brl,", 4)) { - brl_options = ""; - str += 4; - } else if (!memcmp(str, "brl=", 4)) { - brl_options = str + 4; - str = strchr(brl_options, ','); - if (!str) { - printk(KERN_ERR "need port name after brl=\n"); - return 1; - } - *(str++) = 0; - } -#endif - - /* - * Decode str into name, index, options. - */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(buf, "ttyS"); - strncpy(buf + 4, str, sizeof(buf) - 5); - } else { - strncpy(buf, str, sizeof(buf) - 1); - } - buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) - *(options++) = 0; -#ifdef __sparc__ - if (!strcmp(str, "ttya")) - strcpy(buf, "ttyS0"); - if (!strcmp(str, "ttyb")) - strcpy(buf, "ttyS1"); -#endif - for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') - break; - idx = simple_strtoul(s, NULL, 10); - *s = 0; - - __add_preferred_console(buf, idx, options, brl_options); - console_set_on_cmdline = 1; - return 1; -} -__setup("console=", console_setup); - -/** - * add_preferred_console - add a device to the list of preferred consoles. 
- * @name: device name - * @idx: device index - * @options: options for this console - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. - */ -int add_preferred_console(char *name, int idx, char *options) -{ - return __add_preferred_console(name, idx, options, NULL); -} - -int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) -{ - struct console_cmdline *c; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - c = &console_cmdline[i]; - strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx_new; - return i; - } - /* not found */ - return -1; -} - -int console_suspend_enabled = 1; -EXPORT_SYMBOL(console_suspend_enabled); - -static int __init console_suspend_disable(char *str) -{ - console_suspend_enabled = 0; - return 1; -} -__setup("no_console_suspend", console_suspend_disable); - -/** - * suspend_console - suspend the console subsystem - * - * This disables printk() while we go into suspend states - */ -void suspend_console(void) -{ - if (!console_suspend_enabled) - return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); - acquire_console_sem(); - console_suspended = 1; -} - -void resume_console(void) -{ - if (!console_suspend_enabled) - return; - console_suspended = 0; - release_console_sem(); -} - -/** - * acquire_console_sem - lock the console system for exclusive use. 
- * - * Acquires a semaphore which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. - * - * Can sleep, returns nothing. - */ -void acquire_console_sem(void) -{ - BUG_ON(in_interrupt()); - if (console_suspended) { - down(&secondary_console_sem); - return; - } - down(&console_sem); - console_locked = 1; - console_may_schedule = 1; -} -EXPORT_SYMBOL(acquire_console_sem); - -int try_acquire_console_sem(void) -{ - if (down_trylock(&console_sem)) - return -1; - console_locked = 1; - console_may_schedule = 0; - return 0; -} -EXPORT_SYMBOL(try_acquire_console_sem); - -int is_console_locked(void) -{ - return console_locked; -} - -static DEFINE_PER_CPU(int, printk_pending); - -void printk_tick(void) -{ - if (__get_cpu_var(printk_pending)) { - __get_cpu_var(printk_pending) = 0; - wake_up_interruptible(&log_wait); - } -} - -int printk_needs_cpu(int cpu) -{ - return per_cpu(printk_pending, cpu); -} - -void wake_up_klogd(void) -{ - if (waitqueue_active(&log_wait)) - __raw_get_cpu_var(printk_pending) = 1; -} - -/** - * release_console_sem - unlock the console system - * - * Releases the semaphore which the caller holds on the console system - * and the console driver list. - * - * While the semaphore was held, console output may have been buffered - * by printk(). If this is the case, release_console_sem() emits - * the output prior to releasing the semaphore. - * - * If there is output waiting for klogd, we wake it up. - * - * release_console_sem() may be called from any context. 
- */ -void release_console_sem(void) -{ - unsigned long flags; - unsigned _con_start, _log_end; - unsigned wake_klogd = 0; - - if (console_suspended) { - up(&secondary_console_sem); - return; - } - - console_may_schedule = 0; - - for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); - wake_klogd |= log_start - log_end; - if (con_start == log_end) - break; /* Nothing to print */ - _con_start = con_start; - _log_end = log_end; - con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(_con_start, _log_end); - start_critical_timings(); - local_irq_restore(flags); - } - console_locked = 0; - up(&console_sem); - spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd) - wake_up_klogd(); -} -EXPORT_SYMBOL(release_console_sem); - -/** - * console_conditional_schedule - yield the CPU if required - * - * If the console code is currently allowed to sleep, and - * if this CPU should yield the CPU to another task, do - * so here. - * - * Must be called within acquire_console_sem(). - */ -void __sched console_conditional_schedule(void) -{ - if (console_may_schedule) - cond_resched(); -} -EXPORT_SYMBOL(console_conditional_schedule); - -void console_print(const char *s) -{ - printk(KERN_EMERG "%s", s); -} -EXPORT_SYMBOL(console_print); - -void console_unblank(void) -{ - struct console *c; - - /* - * console_unblank can no longer be called in interrupt context unless - * oops_in_progress is set to 1.. 
- */ - if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) - return; - } else - acquire_console_sem(); - - console_locked = 1; - console_may_schedule = 0; - for (c = console_drivers; c != NULL; c = c->next) - if ((c->flags & CON_ENABLED) && c->unblank) - c->unblank(); - release_console_sem(); -} - -/* - * Return the console tty driver structure and its associated index - */ -struct tty_driver *console_device(int *index) -{ - struct console *c; - struct tty_driver *driver = NULL; - - acquire_console_sem(); - for (c = console_drivers; c != NULL; c = c->next) { - if (!c->device) - continue; - driver = c->device(c, index); - if (driver) - break; - } - release_console_sem(); - return driver; -} - -/* - * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can - * re-enable output afterwards. - */ -void console_stop(struct console *console) -{ - acquire_console_sem(); - console->flags &= ~CON_ENABLED; - release_console_sem(); -} -EXPORT_SYMBOL(console_stop); - -void console_start(struct console *console) -{ - acquire_console_sem(); - console->flags |= CON_ENABLED; - release_console_sem(); -} -EXPORT_SYMBOL(console_start); - -/* - * The console driver calls this routine during kernel initialization - * to register the console printing procedure with printk() and to - * print any messages that were printed by the kernel before the - * console driver was initialized. - */ -void register_console(struct console *console) -{ - int i; - unsigned long flags; - struct console *bootconsole = NULL; - - if (console_drivers) { - if (console->flags & CON_BOOT) - return; - if (console_drivers->flags & CON_BOOT) - bootconsole = console_drivers; - } - - if (preferred_console < 0 || bootconsole || !console_drivers) - preferred_console = selected_console; - - if (console->early_setup) - console->early_setup(); - - /* - * See if we want to use this console driver. 
If we - * didn't select a console we take the first one - * that registers here. - */ - if (preferred_console < 0) { - if (console->index < 0) - console->index = 0; - if (console->setup == NULL || - console->setup(console, NULL) == 0) { - console->flags |= CON_ENABLED; - if (console->device) { - console->flags |= CON_CONSDEV; - preferred_console = 0; - } - } - } - - /* - * See if this console matches one we selected on - * the command line. - */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; - i++) { - if (strcmp(console_cmdline[i].name, console->name) != 0) - continue; - if (console->index >= 0 && - console->index != console_cmdline[i].index) - continue; - if (console->index < 0) - console->index = console_cmdline[i].index; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console_cmdline[i].brl_options) { - console->flags |= CON_BRL; - braille_register_console(console, - console_cmdline[i].index, - console_cmdline[i].options, - console_cmdline[i].brl_options); - return; - } -#endif - if (console->setup && - console->setup(console, console_cmdline[i].options) != 0) - break; - console->flags |= CON_ENABLED; - console->index = console_cmdline[i].index; - if (i == selected_console) { - console->flags |= CON_CONSDEV; - preferred_console = selected_console; - } - break; - } - - if (!(console->flags & CON_ENABLED)) - return; - - if (bootconsole && (console->flags & CON_CONSDEV)) { - printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", - bootconsole->name, bootconsole->index, - console->name, console->index); - unregister_console(bootconsole); - console->flags &= ~CON_PRINTBUFFER; - } else { - printk(KERN_INFO "console [%s%d] enabled\n", - console->name, console->index); - } - - /* - * Put this console in the list - keep the - * preferred driver at the head of the list. 
- */ - acquire_console_sem(); - if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { - console->next = console_drivers; - console_drivers = console; - if (console->next) - console->next->flags &= ~CON_CONSDEV; - } else { - console->next = console_drivers->next; - console_drivers->next = console; - } - if (console->flags & CON_PRINTBUFFER) { - /* - * release_console_sem() will print out the buffered messages - * for us. - */ - spin_lock_irqsave(&logbuf_lock, flags); - con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); - } - release_console_sem(); -} -EXPORT_SYMBOL(register_console); - -int unregister_console(struct console *console) -{ - struct console *a, *b; - int res = 1; - -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console->flags & CON_BRL) - return braille_unregister_console(console); -#endif - - acquire_console_sem(); - if (console_drivers == console) { - console_drivers=console->next; - res = 0; - } else if (console_drivers) { - for (a=console_drivers->next, b=console_drivers ; - a; b=a, a=b->next) { - if (a == console) { - b->next = a->next; - res = 0; - break; - } - } - } - - /* - * If this isn't the last console and it has CON_CONSDEV set, we - * need to set it on the next preferred console. - */ - if (console_drivers != NULL && console->flags & CON_CONSDEV) - console_drivers->flags |= CON_CONSDEV; - - release_console_sem(); - return res; -} -EXPORT_SYMBOL(unregister_console); - -static int __init disable_boot_consoles(void) -{ - if (console_drivers != NULL) { - if (console_drivers->flags & CON_BOOT) { - printk(KERN_INFO "turn off boot console %s%d\n", - console_drivers->name, console_drivers->index); - return unregister_console(console_drivers); - } - } - return 0; -} -late_initcall(disable_boot_consoles); - -/** - * tty_write_message - write a message to a certain tty, not just the console. 
- * @tty: the destination tty_struct - * @msg: the message to write - * - * This is used for messages that need to be redirected to a specific tty. - * We don't put it into the syslog queue right now maybe in the future if - * really needed. - */ -void tty_write_message(struct tty_struct *tty, char *msg) -{ - if (tty && tty->ops->write) - tty->ops->write(tty, msg, strlen(msg)); - return; -} - -#if defined CONFIG_PRINTK - -/* - * printk rate limiting, lifted from the networking subsystem. - * - * This enforces a rate limit: not more than 10 kernel messages - * every 5s to make a denial-of-service attack impossible. - */ -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); - -int printk_ratelimit(void) -{ - return __ratelimit(&printk_ratelimit_state); -} -EXPORT_SYMBOL(printk_ratelimit); - -/** - * printk_timed_ratelimit - caller-controlled printk ratelimiting - * @caller_jiffies: pointer to caller's state - * @interval_msecs: minimum interval between prints - * - * printk_timed_ratelimit() returns true if more than @interval_msecs - * milliseconds have elapsed since the last time printk_timed_ratelimit() - * returned true. - */ -bool printk_timed_ratelimit(unsigned long *caller_jiffies, - unsigned int interval_msecs) -{ - if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { - *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); - return true; - } - return false; -} -EXPORT_SYMBOL(printk_timed_ratelimit); -#endif -/* - * linux/kernel/profile.c - * Simple profiling. Manages a direct-mapped profile hit count buffer, - * with configurable resolution, support for restricting the cpus on - * which profiling is done, and switching between cpu time and - * schedule() calls via kernel command line parameters passed at boot. 
- * - * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, - * Red Hat, July 2004 - * Consolidation of architecture support code for profiling, - * William Irwin, Oracle, July 2004 - * Amortized hit count accounting via per-cpu open-addressed hashtables - * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct profile_hit { - u32 pc, hits; -}; -#define PROFILE_GRPSHIFT 3 -#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) -#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) -#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) - -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; - -static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; - -int prof_on __read_mostly; -EXPORT_SYMBOL_GPL(prof_on); - -static cpumask_t prof_cpu_mask = CPU_MASK_ALL; -#ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ - -static int __init profile_setup(char *str) -{ - static char __initdata schedstr[] = "schedule"; - static char __initdata sleepstr[] = "sleep"; - static char __initdata kvmstr[] = "kvm"; - int par; - - if (!strncmp(str, sleepstr, strlen(sleepstr))) { -#ifdef CONFIG_SCHEDSTATS - prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = par; - printk(KERN_INFO - "kernel sleep profiling enabled (shift: %ld)\n", - prof_shift); -#else - printk(KERN_WARNING - "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); -#endif /* CONFIG_SCHEDSTATS */ - } else if (!strncmp(str, schedstr, strlen(schedstr))) { - prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - 
prof_shift = par; - printk(KERN_INFO - "kernel schedule profiling enabled (shift: %ld)\n", - prof_shift); - } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { - prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = par; - printk(KERN_INFO - "kernel KVM profiling enabled (shift: %ld)\n", - prof_shift); - } else if (get_option(&str, &par)) { - prof_shift = par; - prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", - prof_shift); - } - return 1; -} -__setup("profile=", profile_setup); - - -void __init profile_init(void) -{ - if (!prof_on) - return; - - /* only text is profiled */ - prof_len = (_etext - _stext) >> prof_shift; - prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); -} - -/* Profile event notifications */ - -static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); -static ATOMIC_NOTIFIER_HEAD(task_free_notifier); -static BLOCKING_NOTIFIER_HEAD(munmap_notifier); - -void profile_task_exit(struct task_struct *task) -{ - blocking_notifier_call_chain(&task_exit_notifier, 0, task); -} - -int profile_handoff_task(struct task_struct *task) -{ - int ret; - ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); - return (ret == NOTIFY_OK) ? 
1 : 0; -} - -void profile_munmap(unsigned long addr) -{ - blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); -} - -int task_handoff_register(struct notifier_block *n) -{ - return atomic_notifier_chain_register(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_register); - -int task_handoff_unregister(struct notifier_block *n) -{ - return atomic_notifier_chain_unregister(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_unregister); - -int profile_event_register(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_register( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_register( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_register); - -int profile_event_unregister(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_unregister( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_unregister( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_unregister); - -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - if (timer_hook) - return -EBUSY; - timer_hook = hook; - return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - WARN_ON(hook != timer_hook); - timer_hook = NULL; - /* make sure all CPUs see the NULL hook */ - synchronize_sched(); /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - - -#ifdef CONFIG_SMP -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. 
The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. 
- * - * -- wli - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -void profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - if (prof_on != type || !prof_buffer) - return; - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. 
The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int __devinit profile_cpu_callback(struct notifier_block *info, - unsigned long action, void *__cpu) -{ - int node, cpu = (unsigned long)__cpu; - struct page *page; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - node = cpu_to_node(cpu); - per_cpu(cpu_profile_flip, cpu) = 0; - if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - return NOTIFY_BAD; - per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); - } - if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - goto out_free; - per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); - } - break; -out_free: - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - return NOTIFY_BAD; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cpu_set(cpu, prof_cpu_mask); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cpu_clear(cpu, prof_cpu_mask); - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - 
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - break; - } - return NOTIFY_OK; -} -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) -#define profile_cpu_callback NULL - -void profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long pc; - - if (prof_on != type || !prof_buffer) - return; - pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); -} -#endif /* !CONFIG_SMP */ -EXPORT_SYMBOL_GPL(profile_hits); - -void profile_tick(int type) -{ - struct pt_regs *regs = get_irq_regs(); - - if (type == CPU_PROFILING && timer_hook) - timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) - profile_hit(type, (void *)profile_pc(regs)); -} - -#ifdef CONFIG_PROC_FS -#include -#include -#include - -static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; -} - -static int prof_cpu_mask_write_proc(struct file *file, - const char __user *buffer, unsigned long count, void *data) -{ - cpumask_t *mask = (cpumask_t *)data; - unsigned long full_count = count, err; - cpumask_t new_value; - - err = cpumask_parse_user(buffer, count, new_value); - if (err) - return err; - - *mask = new_value; - return full_count; -} - -void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) -{ - struct proc_dir_entry *entry; - - /* create /proc/irq/prof_cpu_mask */ - entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); - if (!entry) - return; - entry->data = (void *)&prof_cpu_mask; - entry->read_proc = prof_cpu_mask_read_proc; - entry->write_proc = prof_cpu_mask_write_proc; -} - -/* - * This function 
accesses profiling information. The returned data is - * binary: the sampling step and the actual contents of the profile - * buffer. Use of the program readprofile is recommended in order to - * get meaningful info out of these data. - */ -static ssize_t -read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t read; - char *pnt; - unsigned int sample_step = 1 << prof_shift; - - profile_flip_buffers(); - if (p >= (prof_len+1)*sizeof(unsigned int)) - return 0; - if (count > (prof_len+1)*sizeof(unsigned int) - p) - count = (prof_len+1)*sizeof(unsigned int) - p; - read = 0; - - while (p < sizeof(unsigned int) && count > 0) { - if (put_user(*((char *)(&sample_step)+p), buf)) - return -EFAULT; - buf++; p++; count--; read++; - } - pnt = (char *)prof_buffer + p - sizeof(atomic_t); - if (copy_to_user(buf, (void *)pnt, count)) - return -EFAULT; - read += count; - *ppos += read; - return read; -} - -/* - * Writing to /proc/profile resets the counters - * - * Writing a 'profiling multiplier' value into it also re-sets the profiling - * interrupt frequency, on architectures that support this. 
- */ -static ssize_t write_profile(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ -#ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - - if (count == sizeof(int)) { - unsigned int multiplier; - - if (copy_from_user(&multiplier, buf, sizeof(int))) - return -EFAULT; - - if (setup_profiling_timer(multiplier)) - return -EINVAL; - } -#endif - profile_discard_flip_buffers(); - memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); - return count; -} - -static const struct file_operations proc_profile_operations = { - .read = read_profile, - .write = write_profile, -}; - -#ifdef CONFIG_SMP -static void __init profile_nop(void *unused) -{ -} - -static int __init create_hash_tables(void) -{ - int cpu; - - for_each_online_cpu(cpu) { - int node = cpu_to_node(cpu); - struct page *page; - - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[1] - = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[0] - = (struct profile_hit *)page_address(page); - } - return 0; -out_cleanup: - prof_on = 0; - smp_mb(); - on_each_cpu(profile_nop, NULL, 1); - for_each_online_cpu(cpu) { - struct page *page; - - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - } - return -1; -} -#else -#define create_hash_tables() ({ 0; }) -#endif - -static int __init create_proc_profile(void) -{ - struct proc_dir_entry *entry; - - if (!prof_on) - return 0; - if (create_hash_tables()) - return -1; - entry = proc_create("profile", S_IWUSR | S_IRUGO, - 
NULL, &proc_profile_operations); - if (!entry) - return 0; - entry->size = (1+prof_len) * sizeof(atomic_t); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; -} -module_init(create_proc_profile); -#endif /* CONFIG_PROC_FS */ -/* - * linux/kernel/ptrace.c - * - * (C) Copyright 1999 Linus Torvalds - * - * Common interfaces for "ptrace()" which we do not want - * to continually duplicate across every architecture. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * ptrace a task: make the debugger its new parent and - * move it to the ptrace list. - * - * Must be called with the tasklist lock write-held. - */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) -{ - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; -} - -/* - * Turn a tracing stop into a normal stop now, since with no tracer there - * would be no way to wake it up with SIGCONT or SIGKILL. If there was a - * signal sent that would resume the child, but didn't because it was in - * TASK_TRACED, resume it now. - * Requires that irqs be disabled. - */ -void ptrace_untrace(struct task_struct *child) -{ - spin_lock(&child->sighand->siglock); - if (task_is_traced(child)) { - if (child->signal->flags & SIGNAL_STOP_STOPPED) { - __set_task_state(child, TASK_STOPPED); - } else { - signal_wake_up(child, 1); - } - } - spin_unlock(&child->sighand->siglock); -} - -/* - * unptrace a task: move it back to its original parent and - * remove it from the ptrace list. - * - * Must be called with the tasklist lock write-held. 
- */ -void __ptrace_unlink(struct task_struct *child) -{ - BUG_ON(!child->ptrace); - - child->ptrace = 0; - child->parent = child->real_parent; - list_del_init(&child->ptrace_entry); - - if (task_is_traced(child)) - ptrace_untrace(child); -} - -/* - * Check that we have indeed attached to the thing.. - */ -int ptrace_check_attach(struct task_struct *child, int kill) -{ - int ret = -ESRCH; - - /* - * We take the read lock around doing both checks to close a - * possible race where someone else was tracing our child and - * detached between these two checks. After this locked check, - * we are sure that this is our traced child and that can only - * be changed by us so it's not changing right after this. - */ - read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { - ret = 0; - /* - * child->sighand can't be NULL, release_task() - * does ptrace_unlink() before __exit_signal(). - */ - spin_lock_irq(&child->sighand->siglock); - if (task_is_stopped(child)) - child->state = TASK_TRACED; - else if (!task_is_traced(child) && !kill) - ret = -ESRCH; - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); - - if (!ret && !kill) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; - - /* All systems go.. */ - return ret; -} - -int __ptrace_may_access(struct task_struct *task, unsigned int mode) -{ - /* May we inspect the given task? - * This check is used both for attaching with ptrace - * and for allowing access to sensitive information in /proc. - * - * ptrace_attach denies several cases that /proc allows - * because setting up the necessary parent/child relationship - * or halting the specified task is impossible. 
- */ - int dumpable = 0; - /* Don't let security modules deny introspection */ - if (task == current) - return 0; - if (((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) - return -EPERM; - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) - return -EPERM; - - return security_ptrace_may_access(task, mode); -} - -bool ptrace_may_access(struct task_struct *task, unsigned int mode) -{ - int err; - task_lock(task); - err = __ptrace_may_access(task, mode); - task_unlock(task); - return (!err ? true : false); -} - -int ptrace_attach(struct task_struct *task) -{ - int retval; - unsigned long flags; - - audit_ptrace(task); - - retval = -EPERM; - if (same_thread_group(task, current)) - goto out; - -repeat: - /* - * Nasty, nasty. - * - * We want to hold both the task-lock and the - * tasklist_lock for writing at the same time. - * But that's against the rules (tasklist_lock - * is taken for reading by interrupts on other - * cpu's that may have task_lock). 
- */ - task_lock(task); - if (!write_trylock_irqsave(&tasklist_lock, flags)) { - task_unlock(task); - do { - cpu_relax(); - } while (!write_can_lock(&tasklist_lock)); - goto repeat; - } - - if (!task->mm) - goto bad; - /* the same process cannot be attached many times */ - if (task->ptrace & PT_PTRACED) - goto bad; - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); - if (retval) - goto bad; - - /* Go */ - task->ptrace |= PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; - - __ptrace_link(task, current); - - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); -bad: - write_unlock_irqrestore(&tasklist_lock, flags); - task_unlock(task); -out: - return retval; -} - -static inline void __ptrace_detach(struct task_struct *child, unsigned int data) -{ - child->exit_code = data; - /* .. re-parent .. */ - __ptrace_unlink(child); - /* .. and wake it up. */ - if (child->exit_state != EXIT_ZOMBIE) - wake_up_state(child, TASK_TRACED | TASK_STOPPED); -} - -int ptrace_detach(struct task_struct *child, unsigned int data) -{ - if (!valid_signal(data)) - return -EIO; - - /* Architecture-specific hardware disable .. */ - ptrace_disable(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - - write_lock_irq(&tasklist_lock); - /* protect against de_thread()->release_task() */ - if (child->ptrace) - __ptrace_detach(child, data); - write_unlock_irq(&tasklist_lock); - - return 0; -} - -int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) -{ - int copied = 0; - - while (len > 0) { - char buf[128]; - int this_len, retval; - - this_len = (len > sizeof(buf)) ? 
sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); - if (!retval) { - if (copied) - break; - return -EIO; - } - if (copy_to_user(dst, buf, retval)) - return -EFAULT; - copied += retval; - src += retval; - dst += retval; - len -= retval; - } - return copied; -} - -int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len) -{ - int copied = 0; - - while (len > 0) { - char buf[128]; - int this_len, retval; - - this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - if (copy_from_user(buf, src, this_len)) - return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); - if (!retval) { - if (copied) - break; - return -EIO; - } - copied += retval; - src += retval; - dst += retval; - len -= retval; - } - return copied; -} - -static int ptrace_setoptions(struct task_struct *child, long data) -{ - child->ptrace &= ~PT_TRACE_MASK; - - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; - - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; - - return (data & ~PTRACE_O_MASK) ? 
-EINVAL : 0; -} - -static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) -{ - int error = -ESRCH; - - read_lock(&tasklist_lock); - if (likely(child->sighand != NULL)) { - error = -EINVAL; - spin_lock_irq(&child->sighand->siglock); - if (likely(child->last_siginfo != NULL)) { - *info = *child->last_siginfo; - error = 0; - } - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); - return error; -} - -static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) -{ - int error = -ESRCH; - - read_lock(&tasklist_lock); - if (likely(child->sighand != NULL)) { - error = -EINVAL; - spin_lock_irq(&child->sighand->siglock); - if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = *info; - error = 0; - } - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); - return error; -} - - -#ifdef PTRACE_SINGLESTEP -#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif - -#ifdef PTRACE_SINGLEBLOCK -#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) -#else -#define is_singleblock(request) 0 -#endif - -#ifdef PTRACE_SYSEMU -#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) -#else -#define is_sysemu_singlestep(request) 0 -#endif - -static int ptrace_resume(struct task_struct *child, long request, long data) -{ - if (!valid_signal(data)) - return -EIO; - - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - -#ifdef TIF_SYSCALL_EMU - if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -#endif - - if (is_singleblock(request)) { - if (unlikely(!arch_has_block_step())) - return -EIO; - user_enable_block_step(child); - } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { - if 
(unlikely(!arch_has_single_step())) - return -EIO; - user_enable_single_step(child); - } - else - user_disable_single_step(child); - - child->exit_code = data; - wake_up_process(child); - - return 0; -} - -int ptrace_request(struct task_struct *child, long request, - long addr, long data) -{ - int ret = -EIO; - siginfo_t siginfo; - - switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - return generic_ptrace_peekdata(child, addr, data); - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - return generic_ptrace_pokedata(child, addr, data); - -#ifdef PTRACE_OLDSETOPTIONS - case PTRACE_OLDSETOPTIONS: -#endif - case PTRACE_SETOPTIONS: - ret = ptrace_setoptions(child, data); - break; - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, (unsigned long __user *) data); - break; - - case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, &siginfo); - if (!ret) - ret = copy_siginfo_to_user((siginfo_t __user *) data, - &siginfo); - break; - - case PTRACE_SETSIGINFO: - if (copy_from_user(&siginfo, (siginfo_t __user *) data, - sizeof siginfo)) - ret = -EFAULT; - else - ret = ptrace_setsiginfo(child, &siginfo); - break; - - case PTRACE_DETACH: /* detach a process that was attached. */ - ret = ptrace_detach(child, data); - break; - -#ifdef PTRACE_SINGLESTEP - case PTRACE_SINGLESTEP: -#endif -#ifdef PTRACE_SINGLEBLOCK - case PTRACE_SINGLEBLOCK: -#endif -#ifdef PTRACE_SYSEMU - case PTRACE_SYSEMU: - case PTRACE_SYSEMU_SINGLESTEP: -#endif - case PTRACE_SYSCALL: - case PTRACE_CONT: - return ptrace_resume(child, request, data); - - case PTRACE_KILL: - if (child->exit_state) /* already dead */ - return 0; - return ptrace_resume(child, request, SIGKILL); - - default: - break; - } - - return ret; -} - -/** - * ptrace_traceme -- helper for PTRACE_TRACEME - * - * Performs checks and sets PT_PTRACED. - * Should be used by all ptrace implementations for PTRACE_TRACEME. - */ -int ptrace_traceme(void) -{ - int ret = -EPERM; - - /* - * Are we already being traced? 
- */ -repeat: - task_lock(current); - if (!(current->ptrace & PT_PTRACED)) { - /* - * See ptrace_attach() comments about the locking here. - */ - unsigned long flags; - if (!write_trylock_irqsave(&tasklist_lock, flags)) { - task_unlock(current); - do { - cpu_relax(); - } while (!write_can_lock(&tasklist_lock)); - goto repeat; - } - - ret = security_ptrace_traceme(current->parent); - - /* - * Set the ptrace bit in the process ptrace flags. - * Then link us on our parent's ptraced list. - */ - if (!ret) { - current->ptrace |= PT_PTRACED; - __ptrace_link(current, current->real_parent); - } - - write_unlock_irqrestore(&tasklist_lock, flags); - } - task_unlock(current); - return ret; -} - -/** - * ptrace_get_task_struct -- grab a task struct reference for ptrace - * @pid: process id to grab a task_struct reference of - * - * This function is a helper for ptrace implementations. It checks - * permissions and then grabs a task struct for use of the actual - * ptrace implementation. - * - * Returns the task_struct for @pid or an ERR_PTR() on failure. 
- */ -struct task_struct *ptrace_get_task_struct(pid_t pid) -{ - struct task_struct *child; - - read_lock(&tasklist_lock); - child = find_task_by_vpid(pid); - if (child) - get_task_struct(child); - - read_unlock(&tasklist_lock); - if (!child) - return ERR_PTR(-ESRCH); - return child; -} - -#ifndef arch_ptrace_attach -#define arch_ptrace_attach(child) do { } while (0) -#endif - -SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) -{ - struct task_struct *child; - long ret; - - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); - if (request == PTRACE_TRACEME) { - ret = ptrace_traceme(); - if (!ret) - arch_ptrace_attach(current); - goto out; - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); - goto out_put_task_struct; - } - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_put_task_struct; - - ret = arch_ptrace(child, request, addr, data); - if (ret < 0) - goto out_put_task_struct; - - out_put_task_struct: - put_task_struct(child); - out: - unlock_kernel(); - return ret; -} - -int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) -{ - unsigned long tmp; - int copied; - - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); - if (copied != sizeof(tmp)) - return -EIO; - return put_user(tmp, (unsigned long __user *)data); -} - -int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) -{ - int copied; - - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); - return (copied == sizeof(data)) ? 
0 : -EIO; -} - -#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE -#include - -int compat_ptrace_request(struct task_struct *child, compat_long_t request, - compat_ulong_t addr, compat_ulong_t data) -{ - compat_ulong_t __user *datap = compat_ptr(data); - compat_ulong_t word; - siginfo_t siginfo; - int ret; - - switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); - if (ret != sizeof(word)) - ret = -EIO; - else - ret = put_user(word, datap); - break; - - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); - ret = (ret != sizeof(data) ? -EIO : 0); - break; - - case PTRACE_GETEVENTMSG: - ret = put_user((compat_ulong_t) child->ptrace_message, datap); - break; - - case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, &siginfo); - if (!ret) - ret = copy_siginfo_to_user32( - (struct compat_siginfo __user *) datap, - &siginfo); - break; - - case PTRACE_SETSIGINFO: - memset(&siginfo, 0, sizeof siginfo); - if (copy_siginfo_from_user32( - &siginfo, (struct compat_siginfo __user *) datap)) - ret = -EFAULT; - else - ret = ptrace_setsiginfo(child, &siginfo); - break; - - default: - ret = ptrace_request(child, request, addr, data); - } - - return ret; -} - -asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data) -{ - struct task_struct *child; - long ret; - - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); - if (request == PTRACE_TRACEME) { - ret = ptrace_traceme(); - goto out; - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. 
- */ - if (!ret) - arch_ptrace_attach(child); - goto out_put_task_struct; - } - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (!ret) - ret = compat_arch_ptrace(child, request, addr, data); - - out_put_task_struct: - put_task_struct(child); - out: - unlock_kernel(); - return ret; -} -#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
- * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, -}; -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, -}; - -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; - -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - cpumask_t cpumask; - set_need_resched(); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - * - * cpu_online_map is updated by the _cpu_down() - * using __stop_machine(). Since we're in irqs disabled - * section, __stop_machine() is not exectuting, hence - * the cpu_online_map is stable. - * - * However, a cpu might have been offlined _just_ before - * we disabled irqs while entering here. - * And rcu subsystem might not yet have handled the CPU_DEAD - * notification, leading to the offlined cpu's bit - * being set in the rcp->cpumask. 
- * - * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent - * sending smp_reschedule() to an offlined CPU. - */ - cpus_and(cpumask, rcp->cpumask, cpu_online_map); - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask_nr(cpu, cpumask) - smp_send_reschedule(cpu); - } -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - set_need_resched(); -} -#endif - -/** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. 
This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* Raises the softirq for processing rcu_callbacks. */ -static inline void raise_rcu_softirq(void) -{ - raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); -} - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. 
- */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_disable(); - rdp->qlen -= count; - local_irq_enable(); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - raise_rcu_softirq(); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->next_pending && - rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. 
- */ - smp_wmb(); - rcp->cur++; - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. - */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock(&rcp->lock); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock(&rcp->lock); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. 
do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) -{ - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_bh(&rcp->lock); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - - local_irq_disable(); - this_rdp->qlen += rdp->qlen; - local_irq_enable(); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from softirq context. 
- */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); - - /* - * start the next batch of callbacks - */ - - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - - if (!rcp->next_pending) { - /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; - - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. 
This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); -} - -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. - * - * Also do a memory barrier. This is needed to handle - * the case where writes from a preempt-disable section - * of code get reordered into schedule() by this CPU's - * write buffer. The memory barrier makes sure that - * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see - * by other CPUs to happen after any such write. - */ - - smp_mb(); /* See above block comment. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. The memory barrier - * is needed for the same reason as is the above one. - */ - - smp_mb(); /* See above block comment. 
*/ - rcu_bh_qsctr_inc(cpu); - } - raise_rcu_softirq(); -} - -static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; -} - -static void __cpuinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. - * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. 
- */ -void __init __rcu_init(void) -{ - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
- * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -enum rcu_barrier { - RCU_BARRIER_STD, - RCU_BARRIER_BH, - RCU_BARRIER_SCHED, -}; - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - -/* - * Awaken the corresponding synchronize_rcu() instance now that a - * grace period has elapsed. - */ -void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void); /* Makes kernel-doc tools happy */ -synchronize_rcu_xxx(synchronize_rcu, call_rcu) -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. 
- */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - - atomic_inc(&rcu_barrier_cpu_count); - switch ((enum rcu_barrier)type) { - case RCU_BARRIER_STD: - call_rcu(head, rcu_barrier_callback); - break; - case RCU_BARRIER_BH: - call_rcu_bh(head, rcu_barrier_callback); - break; - case RCU_BARRIER_SCHED: - call_rcu_sched(head, rcu_barrier_callback); - break; - } -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(enum rcu_barrier type) -{ - BUG_ON(in_interrupt()); - /* Take cpucontrol mutex to protect against CPU hotplug */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 0); - /* - * The queueing of callbacks in all CPUs must be atomic with - * respect to RCU, otherwise one CPU may queue a callback, - * wait for a grace period, decrement barrier count and call - * complete(), while other CPUs have not yet queued anything. - * So, we need to make sure that grace periods cannot complete - * until all the callbacks are queued. - */ - rcu_read_lock(); - on_each_cpu(rcu_barrier_func, (void *)type, 1); - rcu_read_unlock(); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); -} - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - */ -void rcu_barrier(void) -{ - _rcu_barrier(RCU_BARRIER_STD); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(RCU_BARRIER_BH); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. 
- */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(RCU_BARRIER_SCHED); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -void __init rcu_init(void) -{ - __rcu_init(); -} - -/* - * Read-Copy Update mechanism for mutual exclusion, realtime implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2006 - * - * Authors: Paul E. McKenney - * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar - * for pushing me away from locks and towards counters, and - * to Suparna Bhattacharya for pushing me completely away - * from atomic instructions on the read side. - * - * - Added handling of Dynamic Ticks - * Copyright 2007 - Paul E. Mckenney - * - Steven Rostedt - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * Design Document: http://lwn.net/Articles/253651/ - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. 
This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* - * PREEMPT_RCU data structures. - */ - -/* - * GP_STAGES specifies the number of times the state machine has - * to go through the all the rcu_try_flip_states (see below) - * in a single Grace Period. - * - * GP in GP_STAGES stands for Grace Period ;) - */ -#define GP_STAGES 2 -struct rcu_data { - spinlock_t lock; /* Protect rcu_data fields. */ - long completed; /* Number of last completed batch. */ - int waitlistcount; - struct rcu_head *nextlist; - struct rcu_head **nexttail; - struct rcu_head *waitlist[GP_STAGES]; - struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; /* from waitlist & waitschedlist */ - struct rcu_head **donetail; - long rcu_flipctr[2]; - struct rcu_head *nextschedlist; - struct rcu_head **nextschedtail; - struct rcu_head *waitschedlist; - struct rcu_head **waitschedtail; - int rcu_sched_sleeping; -#ifdef CONFIG_RCU_TRACE - struct rcupreempt_trace trace; -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -/* - * States for rcu_try_flip() and friends. - */ - -enum rcu_try_flip_states { - - /* - * Stay here if nothing is happening. Flip the counter if somthing - * starts happening. Denoted by "I" - */ - rcu_try_flip_idle_state, - - /* - * Wait here for all CPUs to notice that the counter has flipped. This - * prevents the old set of counters from ever being incremented once - * we leave this state, which in turn is necessary because we cannot - * test any individual counter for zero -- we can only check the sum. - * Denoted by "A". - */ - rcu_try_flip_waitack_state, - - /* - * Wait here for the sum of the old per-CPU counters to reach zero. - * Denoted by "Z". - */ - rcu_try_flip_waitzero_state, - - /* - * Wait here for each of the other CPUs to execute a memory barrier. 
- * This is necessary to ensure that these other CPUs really have - * completed executing their RCU read-side critical sections, despite - * their CPUs wildly reordering memory. Denoted by "M". - */ - rcu_try_flip_waitmb_state, -}; - -/* - * States for rcu_ctrlblk.rcu_sched_sleep. - */ - -enum rcu_sched_sleep_states { - rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ - rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ - rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ -}; - -struct rcu_ctrlblk { - spinlock_t fliplock; /* Protect state-machine transitions. */ - long completed; /* Number of last completed batch. */ - enum rcu_try_flip_states rcu_try_flip_state; /* The current state of - the rcu state machine */ - spinlock_t schedlock; /* Protect rcu_sched sleep state. */ - enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ - wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ -}; - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); -static struct rcu_ctrlblk rcu_ctrlblk = { - .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), - .completed = 0, - .rcu_try_flip_state = rcu_try_flip_idle_state, - .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), - .sched_sleep = rcu_sched_not_sleeping, - .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), -}; - -static struct task_struct *rcu_sched_grace_period_task; - -#ifdef CONFIG_RCU_TRACE -static char *rcu_try_flip_state_names[] = - { "idle", "waitack", "waitzero", "waitmb" }; -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; - -/* - * Enum and per-CPU flag to determine when each CPU has seen - * the most recent counter flip. - */ - -enum rcu_flip_flag_values { - rcu_flip_seen, /* Steady/initial state, last flip seen. */ - /* Only GP detector can update. */ - rcu_flipped /* Flip just completed, need confirmation. */ - /* Only corresponding CPU can update. 
*/ -}; -static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) - = rcu_flip_seen; - -/* - * Enum and per-CPU flag to determine when each CPU has executed the - * needed memory barrier to fence in memory references from its last RCU - * read-side critical section in the just-completed grace period. - */ - -enum rcu_mb_flag_values { - rcu_mb_done, /* Steady/initial state, no mb()s required. */ - /* Only GP detector can update. */ - rcu_mb_needed /* Flip just completed, need an mb(). */ - /* Only corresponding CPU can update. */ -}; -static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) - = rcu_mb_done; - -/* - * RCU_DATA_ME: find the current CPU's rcu_data structure. - * RCU_DATA_CPU: find the specified CPU's rcu_data structure. - */ -#define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) -#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) - -/* - * Helper macro for tracing when the appropriate rcu_data is not - * cached in a local variable, but where the CPU number is so cached. - */ -#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); - -/* - * Helper macro for tracing when the appropriate rcu_data is not - * cached in a local variable. - */ -#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); - -/* - * Helper macro for tracing when the appropriate rcu_data is pointed - * to by a local variable. - */ -#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); - -#define RCU_SCHED_BATCH_TIME (HZ / 50) - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -void __rcu_read_lock(void) -{ - int idx; - struct task_struct *t = current; - int nesting; - - nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); - if (nesting != 0) { - - /* An earlier rcu_read_lock() covers us, just count it. 
*/ - - t->rcu_read_lock_nesting = nesting + 1; - - } else { - unsigned long flags; - - /* - * We disable interrupts for the following reasons: - * - If we get scheduling clock interrupt here, and we - * end up acking the counter flip, it's like a promise - * that we will never increment the old counter again. - * Thus we will break that promise if that - * scheduling clock interrupt happens between the time - * we pick the .completed field and the time that we - * increment our counter. - * - * - We don't want to be preempted out here. - * - * NMIs can still occur, of course, and might themselves - * contain rcu_read_lock(). - */ - - local_irq_save(flags); - - /* - * Outermost nesting of rcu_read_lock(), so increment - * the current counter for the current CPU. Use volatile - * casts to prevent the compiler from reordering. - */ - - idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; - ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; - - /* - * Now that the per-CPU counter has been incremented, we - * are protected from races with rcu_read_lock() invoked - * from NMI handlers on this CPU. We can therefore safely - * increment the nesting counter, relieving further NMIs - * of the need to increment the per-CPU counter. - */ - - ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; - - /* - * Now that we have preventing any NMIs from storing - * to the ->rcu_flipctr_idx, we can safely use it to - * remember which counter to decrement in the matching - * rcu_read_unlock(). - */ - - ACCESS_ONCE(t->rcu_flipctr_idx) = idx; - local_irq_restore(flags); - } -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -void __rcu_read_unlock(void) -{ - int idx; - struct task_struct *t = current; - int nesting; - - nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); - if (nesting > 1) { - - /* - * We are still protected by the enclosing rcu_read_lock(), - * so simply decrement the counter. 
- */ - - t->rcu_read_lock_nesting = nesting - 1; - - } else { - unsigned long flags; - - /* - * Disable local interrupts to prevent the grace-period - * detection state machine from seeing us half-done. - * NMIs can still occur, of course, and might themselves - * contain rcu_read_lock() and rcu_read_unlock(). - */ - - local_irq_save(flags); - - /* - * Outermost nesting of rcu_read_unlock(), so we must - * decrement the current counter for the current CPU. - * This must be done carefully, because NMIs can - * occur at any point in this code, and any rcu_read_lock() - * and rcu_read_unlock() pairs in the NMI handlers - * must interact non-destructively with this code. - * Lots of volatile casts, and -very- careful ordering. - * - * Changes to this code, including this one, must be - * inspected, validated, and tested extremely carefully!!! - */ - - /* - * First, pick up the index. - */ - - idx = ACCESS_ONCE(t->rcu_flipctr_idx); - - /* - * Now that we have fetched the counter index, it is - * safe to decrement the per-task RCU nesting counter. - * After this, any interrupts or NMIs will increment and - * decrement the per-CPU counters. - */ - ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; - - /* - * It is now safe to decrement this task's nesting count. - * NMIs that occur after this statement will route their - * rcu_read_lock() calls through this "else" clause, and - * will thus start incrementing the per-CPU counter on - * their own. They will also clobber ->rcu_flipctr_idx, - * but that is OK, since we have already fetched it. - */ - - ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; - local_irq_restore(flags); - } -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -/* - * If a global counter flip has occurred since the last time that we - * advanced callbacks, advance them. Hardware interrupts must be - * disabled when calling this function. 
- */ -static void __rcu_advance_callbacks(struct rcu_data *rdp) -{ - int cpu; - int i; - int wlc = 0; - - if (rdp->completed != rcu_ctrlblk.completed) { - if (rdp->waitlist[GP_STAGES - 1] != NULL) { - *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; - rdp->donetail = rdp->waittail[GP_STAGES - 1]; - RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); - } - for (i = GP_STAGES - 2; i >= 0; i--) { - if (rdp->waitlist[i] != NULL) { - rdp->waitlist[i + 1] = rdp->waitlist[i]; - rdp->waittail[i + 1] = rdp->waittail[i]; - wlc++; - } else { - rdp->waitlist[i + 1] = NULL; - rdp->waittail[i + 1] = - &rdp->waitlist[i + 1]; - } - } - if (rdp->nextlist != NULL) { - rdp->waitlist[0] = rdp->nextlist; - rdp->waittail[0] = rdp->nexttail; - wlc++; - rdp->nextlist = NULL; - rdp->nexttail = &rdp->nextlist; - RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); - } else { - rdp->waitlist[0] = NULL; - rdp->waittail[0] = &rdp->waitlist[0]; - } - rdp->waitlistcount = wlc; - rdp->completed = rcu_ctrlblk.completed; - } - - /* - * Check to see if this CPU needs to report that it has seen - * the most recent counter flip, thereby declaring that all - * subsequent rcu_read_lock() invocations will respect this flip. - */ - - cpu = raw_smp_processor_id(); - if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { - smp_mb(); /* Subsequent counter accesses must see new value */ - per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; - smp_mb(); /* Subsequent RCU read-side critical sections */ - /* seen -after- acknowledgement. */ - } -} - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { - .dynticks = 1, -}; - -#ifdef CONFIG_NO_HZ -static DEFINE_PER_CPU(int, rcu_update_flag); - -/** - * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. - * - * If the CPU was idle with dynamic ticks active, this updates the - * rcu_dyntick_sched.dynticks to let the RCU handling know that the - * CPU is active. 
- */ -void rcu_irq_enter(void) -{ - int cpu = smp_processor_id(); - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - if (per_cpu(rcu_update_flag, cpu)) - per_cpu(rcu_update_flag, cpu)++; - - /* - * Only update if we are coming from a stopped ticks mode - * (rcu_dyntick_sched.dynticks is even). - */ - if (!in_interrupt() && - (rdssp->dynticks & 0x1) == 0) { - /* - * The following might seem like we could have a race - * with NMI/SMIs. But this really isn't a problem. - * Here we do a read/modify/write, and the race happens - * when an NMI/SMI comes in after the read and before - * the write. But NMI/SMIs will increment this counter - * twice before returning, so the zero bit will not - * be corrupted by the NMI/SMI which is the most important - * part. - * - * The only thing is that we would bring back the counter - * to a postion that it was in during the NMI/SMI. - * But the zero bit would be set, so the rest of the - * counter would again be ignored. - * - * On return from the IRQ, the counter may have the zero - * bit be 0 and the counter the same as the return from - * the NMI/SMI. If the state machine was so unlucky to - * see that, it still doesn't matter, since all - * RCU read-side critical sections on this CPU would - * have already completed. - */ - rdssp->dynticks++; - /* - * The following memory barrier ensures that any - * rcu_read_lock() primitives in the irq handler - * are seen by other CPUs to follow the above - * increment to rcu_dyntick_sched.dynticks. This is - * required in order for other CPUs to correctly - * determine when it is safe to advance the RCU - * grace-period state machine. - */ - smp_mb(); /* see above block comment. */ - /* - * Since we can't determine the dynamic tick mode from - * the rcu_dyntick_sched.dynticks after this routine, - * we use a second flag to acknowledge that we came - * from an idle state with ticks stopped. 
- */ - per_cpu(rcu_update_flag, cpu)++; - /* - * If we take an NMI/SMI now, they will also increment - * the rcu_update_flag, and will not update the - * rcu_dyntick_sched.dynticks on exit. That is for - * this IRQ to do. - */ - } -} - -/** - * rcu_irq_exit - Called from exiting Hard irq context. - * - * If the CPU was idle with dynamic ticks active, update the - * rcu_dyntick_sched.dynticks to put let the RCU handling be - * aware that the CPU is going back to idle with no ticks. - */ -void rcu_irq_exit(void) -{ - int cpu = smp_processor_id(); - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - /* - * rcu_update_flag is set if we interrupted the CPU - * when it was idle with ticks stopped. - * Once this occurs, we keep track of interrupt nesting - * because a NMI/SMI could also come in, and we still - * only want the IRQ that started the increment of the - * rcu_dyntick_sched.dynticks to be the one that modifies - * it on exit. - */ - if (per_cpu(rcu_update_flag, cpu)) { - if (--per_cpu(rcu_update_flag, cpu)) - return; - - /* This must match the interrupt nesting */ - WARN_ON(in_interrupt()); - - /* - * If an NMI/SMI happens now we are still - * protected by the rcu_dyntick_sched.dynticks being odd. - */ - - /* - * The following memory barrier ensures that any - * rcu_read_unlock() primitives in the irq handler - * are seen by other CPUs to preceed the following - * increment to rcu_dyntick_sched.dynticks. This - * is required in order for other CPUs to determine - * when it is safe to advance the RCU grace-period - * state machine. - */ - smp_mb(); /* see above block comment. 
*/ - rdssp->dynticks++; - WARN_ON(rdssp->dynticks & 0x1); - } -} - -static void dyntick_save_progress_counter(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->dynticks_snap = rdssp->dynticks; -} - -static inline int -rcu_try_flip_waitack_needed(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot be in the middle of an rcu_read_lock(), so - * the next rcu_read_lock() it executes must use the new value - * of the counter. So we can safely pretend that this CPU - * already acknowledged the counter. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq handlers, then, as above, we can safely pretend - * that this CPU already acknowledged the counter. - */ - - if ((curr - snap) > 2 || (curr & 0x1) == 0) - return 0; - - /* We need this CPU to explicitly acknowledge the counter flip. */ - - return 1; -} - -static inline int -rcu_try_flip_waitmb_needed(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot have executed an RCU read-side critical section - * during that time, so there is no need for it to execute a - * memory barrier. 
- */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU either entered or exited an outermost interrupt, - * SMI, NMI, or whatever handler, then we know that it executed - * a memory barrier when doing so. So we don't need another one. - */ - if (curr != snap) - return 0; - - /* We need the CPU to execute a memory barrier. */ - - return 1; -} - -static void dyntick_save_progress_counter_sched(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_dynticks_snap = rdssp->dynticks; -} - -static int rcu_qsctr_inc_needed_dyntick(int cpu) -{ - long curr; - long snap; - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - curr = rdssp->dynticks; - snap = rdssp->sched_dynticks_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ - - /* - * If the CPU remained in dynticks mode for the entire time - * and didn't take any interrupts, NMIs, SMIs, or whatever, - * then it cannot be in the middle of an rcu_read_lock(), so - * the next rcu_read_lock() it executes must use the new value - * of the counter. Therefore, this CPU has been in a quiescent - * state the entire time, and we don't need to wait for it. - */ - - if ((curr == snap) && ((curr & 0x1) == 0)) - return 0; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq handlers, then, as above, this CPU has already - * passed through a quiescent state. - */ - - if ((curr - snap) > 2 || (snap & 0x1) == 0) - return 0; - - /* We need this CPU to go through a quiescent state. 
*/ - - return 1; -} - -#else /* !CONFIG_NO_HZ */ - -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) - -# define dyntick_save_progress_counter_sched(cpu) do { } while (0) -# define rcu_qsctr_inc_needed_dyntick(cpu) (1) - -#endif /* CONFIG_NO_HZ */ - -static void save_qsctr_sched(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs_snap = rdssp->sched_qs; -} - -static inline int rcu_qsctr_inc_needed(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - /* - * If there has been a quiescent state, no more need to wait - * on this CPU. - */ - - if (rdssp->sched_qs != rdssp->sched_qs_snap) { - smp_mb(); /* force ordering with cpu entering schedule(). */ - return 0; - } - - /* We need this CPU to go through a quiescent state. */ - - return 1; -} - -/* - * Get here when RCU is idle. Decide whether we need to - * move out of idle state, and return non-zero if so. - * "Straightforward" approach for the moment, might later - * use callback-list lengths, grace-period duration, or - * some such to determine when to exit idle state. - * Might also need a pre-idle test that does not acquire - * the lock, but let's get the simple case working first... - */ - -static int -rcu_try_flip_idle(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); - if (!rcu_pending(smp_processor_id())) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); - return 0; - } - - /* - * Do the flip. - */ - - RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); - rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ - - /* - * Need a memory barrier so that other CPUs see the new - * counter value before they see the subsequent change of all - * the rcu_flip_flag instances to rcu_flipped. - */ - - smp_mb(); /* see above block comment. */ - - /* Now ask each CPU for acknowledgement of the flip. 
*/ - - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { - per_cpu(rcu_flip_flag, cpu) = rcu_flipped; - dyntick_save_progress_counter(cpu); - } - - return 1; -} - -/* - * Wait for CPUs to acknowledge the flip. - */ - -static int -rcu_try_flip_waitack(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) - if (rcu_try_flip_waitack_needed(cpu) && - per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); - return 0; - } - - /* - * Make sure our checks above don't bleed into subsequent - * waiting for the sum of the counters to reach zero. - */ - - smp_mb(); /* see above block comment. */ - RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); - return 1; -} - -/* - * Wait for collective ``last'' counter to reach zero, - * then tell all CPUs to do an end-of-grace-period memory barrier. - */ - -static int -rcu_try_flip_waitzero(void) -{ - int cpu; - int lastidx = !(rcu_ctrlblk.completed & 0x1); - int sum = 0; - - /* Check to see if the sum of the "last" counters is zero. */ - - RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) - sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; - if (sum != 0) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); - return 0; - } - - /* - * This ensures that the other CPUs see the call for - * memory barriers -after- the sum to zero has been - * detected here - */ - smp_mb(); /* ^^^^^^^^^^^^ */ - - /* Call for a memory barrier from each CPU. */ - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { - per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; - dyntick_save_progress_counter(cpu); - } - - RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); - return 1; -} - -/* - * Wait for all CPUs to do their end-of-grace-period memory barrier. - * Return 0 once all CPUs have done so. 
- */ - -static int -rcu_try_flip_waitmb(void) -{ - int cpu; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) - if (rcu_try_flip_waitmb_needed(cpu) && - per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); - return 0; - } - - smp_mb(); /* Ensure that the above checks precede any following flip. */ - RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); - return 1; -} - -/* - * Attempt a single flip of the counters. Remember, a single flip does - * -not- constitute a grace period. Instead, the interval between - * at least GP_STAGES consecutive flips is a grace period. - * - * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation - * on a large SMP, they might want to use a hierarchical organization of - * the per-CPU-counter pairs. - */ -static void rcu_try_flip(void) -{ - unsigned long flags; - - RCU_TRACE_ME(rcupreempt_trace_try_flip_1); - if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { - RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); - return; - } - - /* - * Take the next transition(s) through the RCU grace-period - * flip-counter state machine. 
- */ - - switch (rcu_ctrlblk.rcu_try_flip_state) { - case rcu_try_flip_idle_state: - if (rcu_try_flip_idle()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitack_state; - break; - case rcu_try_flip_waitack_state: - if (rcu_try_flip_waitack()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitzero_state; - break; - case rcu_try_flip_waitzero_state: - if (rcu_try_flip_waitzero()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_waitmb_state; - break; - case rcu_try_flip_waitmb_state: - if (rcu_try_flip_waitmb()) - rcu_ctrlblk.rcu_try_flip_state = - rcu_try_flip_idle_state; - } - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); -} - -/* - * Check to see if this CPU needs to do a memory barrier in order to - * ensure that any prior RCU read-side critical sections have committed - * their counter manipulations and critical-section memory references - * before declaring the grace period to be completed. - */ -static void rcu_check_mb(int cpu) -{ - if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { - smp_mb(); /* Ensure RCU read-side accesses are visible. */ - per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; - } -} - -void rcu_check_callbacks(int cpu, int user) -{ - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - /* - * If this CPU took its interrupt from user mode or from the - * idle loop, and this is not a nested interrupt, then - * this CPU has to have exited all prior preept-disable - * sections of code. So increment the counter to note this. - * - * The memory barrier is needed to handle the case where - * writes from a preempt-disable section of code get reordered - * into schedule() by this CPU's write buffer. So the memory - * barrier makes sure that the rcu_qsctr_inc() is seen by other - * CPUs to happen after any such write. - */ - - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - smp_mb(); /* Guard against aggressive schedule(). 
*/ - rcu_qsctr_inc(cpu); - } - - rcu_check_mb(cpu); - if (rcu_ctrlblk.completed == rdp->completed) - rcu_try_flip(); - spin_lock_irqsave(&rdp->lock, flags); - RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); - __rcu_advance_callbacks(rdp); - if (rdp->donelist == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); - } else { - spin_unlock_irqrestore(&rdp->lock, flags); - raise_softirq(RCU_SOFTIRQ); - } -} - -/* - * Needed by dynticks, to make sure all RCU processing has finished - * when we go idle: - */ -void rcu_advance_callbacks(int cpu, int user) -{ - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - if (rcu_ctrlblk.completed == rdp->completed) { - rcu_try_flip(); - if (rcu_ctrlblk.completed == rdp->completed) - return; - } - spin_lock_irqsave(&rdp->lock, flags); - RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); - __rcu_advance_callbacks(rdp); - spin_unlock_irqrestore(&rdp->lock, flags); -} - -#ifdef CONFIG_HOTPLUG_CPU -#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ - *dsttail = srclist; \ - if (srclist != NULL) { \ - dsttail = srctail; \ - srclist = NULL; \ - srctail = &srclist;\ - } \ - } while (0) - -void rcu_offline_cpu(int cpu) -{ - int i; - struct rcu_head *list = NULL; - unsigned long flags; - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - struct rcu_head *schedlist = NULL; - struct rcu_head **schedtail = &schedlist; - struct rcu_head **tail = &list; - - /* - * Remove all callbacks from the newly dead CPU, retaining order. 
- * Otherwise rcu_barrier() will fail - */ - - spin_lock_irqsave(&rdp->lock, flags); - rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); - for (i = GP_STAGES - 1; i >= 0; i--) - rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], - list, tail); - rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); - rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, - schedlist, schedtail); - rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, - schedlist, schedtail); - rdp->rcu_sched_sleeping = 0; - spin_unlock_irqrestore(&rdp->lock, flags); - rdp->waitlistcount = 0; - - /* Disengage the newly dead CPU from the grace-period computation. */ - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - rcu_check_mb(cpu); - if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { - smp_mb(); /* Subsequent counter accesses must see new value */ - per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; - smp_mb(); /* Subsequent RCU read-side critical sections */ - /* seen -after- acknowledgement. */ - } - - RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; - RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; - - RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; - RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; - - cpu_clear(cpu, rcu_cpu_online_map); - - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); - - /* - * Place the removed callbacks on the current CPU's queue. - * Make them all start a new grace period: simple approach, - * in theory could starve a given set of callbacks, but - * you would need to be doing some serious CPU hotplugging - * to make this happen. If this becomes a problem, adding - * a synchronize_rcu() to the hotplug path would be a simple - * fix. - */ - - local_irq_save(flags); /* disable preempt till we know what lock. 
*/ - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - *rdp->nexttail = list; - if (list) - rdp->nexttail = tail; - *rdp->nextschedtail = schedlist; - if (schedlist) - rdp->nextschedtail = schedtail; - spin_unlock_irqrestore(&rdp->lock, flags); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -void rcu_offline_cpu(int cpu) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -void __cpuinit rcu_online_cpu(int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - - spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); - cpu_set(cpu, rcu_cpu_online_map); - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); - - /* - * The rcu_sched grace-period processing might have bypassed - * this CPU, given that it was not in the rcu_cpu_online_map - * when the grace-period scan started. This means that the - * grace-period task might sleep. So make sure that if this - * should happen, the first callback posted to this CPU will - * wake up the grace-period task if need be. - */ - - rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); - rdp->rcu_sched_sleeping = 1; - spin_unlock_irqrestore(&rdp->lock, flags); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - unsigned long flags; - struct rcu_head *next, *list; - struct rcu_data *rdp; - - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - list = rdp->donelist; - if (list == NULL) { - spin_unlock_irqrestore(&rdp->lock, flags); - return; - } - rdp->donelist = NULL; - rdp->donetail = &rdp->donelist; - RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); - while (list) { - next = list->next; - list->func(list); - list = next; - RCU_TRACE_ME(rcupreempt_trace_invoke); - } -} - -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - 
__rcu_advance_callbacks(rdp); - *rdp->nexttail = head; - rdp->nexttail = &head->next; - RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock_irqrestore(&rdp->lock, flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - int wake_gp = 0; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = RCU_DATA_ME(); - spin_lock(&rdp->lock); - *rdp->nextschedtail = head; - rdp->nextschedtail = &head->next; - if (rdp->rcu_sched_sleeping) { - - /* Grace-period processing might be sleeping... */ - - rdp->rcu_sched_sleeping = 0; - wake_gp = 1; - } - spin_unlock_irqrestore(&rdp->lock, flags); - if (wake_gp) { - - /* Wake up grace-period processing, unless someone beat us. */ - - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) - wake_gp = 0; - rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - if (wake_gp) - wake_up_interruptible(&rcu_ctrlblk.sched_wq); - } -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Wait until all currently running preempt_disable() code segments - * (including hardware-irq-disable segments) complete. Note that - * in -rt this does -not- necessarily result in all currently executing - * interrupt -handlers- having completed. - */ -synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) -EXPORT_SYMBOL_GPL(__synchronize_sched); - -/* - * kthread function that manages call_rcu_sched grace periods. - */ -static int rcu_sched_grace_period(void *arg) -{ - int couldsleep; /* might sleep after current pass. */ - int couldsleepnext = 0; /* might sleep after next pass. */ - int cpu; - unsigned long flags; - struct rcu_data *rdp; - int ret; - - /* - * Each pass through the following loop handles one - * rcu_sched grace period cycle. - */ - do { - /* Save each CPU's current state. 
*/ - - for_each_online_cpu(cpu) { - dyntick_save_progress_counter_sched(cpu); - save_qsctr_sched(cpu); - } - - /* - * Sleep for about an RCU grace-period's worth to - * allow better batching and to consume less CPU. - */ - schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); - - /* - * If there was nothing to do last time, prepare to - * sleep at the end of the current grace period cycle. - */ - couldsleep = couldsleepnext; - couldsleepnext = 1; - if (couldsleep) { - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - } - - /* - * Wait on each CPU in turn to have either visited - * a quiescent state or been in dynticks-idle mode. - */ - for_each_online_cpu(cpu) { - while (rcu_qsctr_inc_needed(cpu) && - rcu_qsctr_inc_needed_dyntick(cpu)) { - /* resched_cpu(cpu); @@@ */ - schedule_timeout_interruptible(1); - } - } - - /* Advance callbacks for each CPU. */ - - for_each_online_cpu(cpu) { - - rdp = RCU_DATA_CPU(cpu); - spin_lock_irqsave(&rdp->lock, flags); - - /* - * We are running on this CPU irq-disabled, so no - * CPU can go offline until we re-enable irqs. - * The current CPU might have already gone - * offline (between the for_each_offline_cpu and - * the spin_lock_irqsave), but in that case all its - * callback lists will be empty, so no harm done. - * - * Advance the callbacks! We share normal RCU's - * donelist, since callbacks are invoked the - * same way in either case. - */ - if (rdp->waitschedlist != NULL) { - *rdp->donetail = rdp->waitschedlist; - rdp->donetail = rdp->waitschedtail; - - /* - * Next rcu_check_callbacks() will - * do the required raise_softirq(). 
- */ - } - if (rdp->nextschedlist != NULL) { - rdp->waitschedlist = rdp->nextschedlist; - rdp->waitschedtail = rdp->nextschedtail; - couldsleep = 0; - couldsleepnext = 0; - } else { - rdp->waitschedlist = NULL; - rdp->waitschedtail = &rdp->waitschedlist; - } - rdp->nextschedlist = NULL; - rdp->nextschedtail = &rdp->nextschedlist; - - /* Mark sleep intention. */ - - rdp->rcu_sched_sleeping = couldsleep; - - spin_unlock_irqrestore(&rdp->lock, flags); - } - - /* If we saw callbacks on the last scan, go deal with them. */ - - if (!couldsleep) - continue; - - /* Attempt to block... */ - - spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); - if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { - - /* - * Someone posted a callback after we scanned. - * Go take care of it. - */ - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - couldsleepnext = 0; - continue; - } - - /* Block until the next person posts a callback. */ - - rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; - spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; - __wait_event_interruptible(rcu_ctrlblk.sched_wq, - rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, - ret); - - /* - * Signals would prevent us from sleeping, and we cannot - * do much with them in any case. So flush them. - */ - if (ret) - flush_signals(current); - couldsleepnext = 0; - - } while (!kthread_should_stop()); - - return (0); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. Assumes that notifiers would take care of handling any - * outstanding requests from the RCU core. - * - * This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. 
- */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - return (rdp->donelist != NULL || - !!rdp->waitlistcount || - rdp->nextlist != NULL || - rdp->nextschedlist != NULL || - rdp->waitschedlist != NULL); -} - -int rcu_pending(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - /* The CPU has at least one callback queued somewhere. */ - - if (rdp->donelist != NULL || - !!rdp->waitlistcount || - rdp->nextlist != NULL || - rdp->nextschedlist != NULL || - rdp->waitschedlist != NULL) - return 1; - - /* The RCU core needs an acknowledgement from this CPU. */ - - if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || - (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) - return 1; - - /* This CPU has fallen behind the global grace-period number. */ - - if (rdp->completed != rcu_ctrlblk.completed) - return 1; - - /* Nothing needed from this CPU. */ - - return 0; -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -void __init __rcu_init(void) -{ - int cpu; - int i; - struct rcu_data *rdp; - - printk(KERN_NOTICE "Preemptible RCU implementation.\n"); - for_each_possible_cpu(cpu) { - rdp = RCU_DATA_CPU(cpu); - spin_lock_init(&rdp->lock); - rdp->completed = 0; - rdp->waitlistcount = 0; - rdp->nextlist = NULL; - rdp->nexttail = &rdp->nextlist; - for (i = 0; i < GP_STAGES; i++) { - rdp->waitlist[i] = NULL; - rdp->waittail[i] = &rdp->waitlist[i]; - } - rdp->donelist = NULL; - rdp->donetail = &rdp->donelist; - rdp->rcu_flipctr[0] = 0; - rdp->rcu_flipctr[1] = 0; - rdp->nextschedlist = NULL; - 
rdp->nextschedtail = &rdp->nextschedlist; - rdp->waitschedlist = NULL; - rdp->waitschedtail = &rdp->waitschedlist; - rdp->rcu_sched_sleeping = 0; - } - register_cpu_notifier(&rcu_nb); - - /* - * We don't need protection against CPU-Hotplug here - * since - * a) If a CPU comes online while we are iterating over the - * cpu_online_map below, we would only end up making a - * duplicate call to rcu_online_cpu() which sets the corresponding - * CPU's mask in the rcu_cpu_online_map. - * - * b) A CPU cannot go offline at this point in time since the user - * does not have access to the sysfs interface, nor do we - * suspend the system. - */ - for_each_online_cpu(cpu) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); - - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -/* - * Late-boot-time RCU initialization that must wait until after scheduler - * has been initialized. - */ -void __init rcu_init_sched(void) -{ - rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, - NULL, - "rcu_sched_grace_period"); - WARN_ON(IS_ERR(rcu_sched_grace_period_task)); -} - -#ifdef CONFIG_RCU_TRACE -long *rcupreempt_flipctr(int cpu) -{ - return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; -} -EXPORT_SYMBOL_GPL(rcupreempt_flipctr); - -int rcupreempt_flip_flag(int cpu) -{ - return per_cpu(rcu_flip_flag, cpu); -} -EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); - -int rcupreempt_mb_flag(int cpu) -{ - return per_cpu(rcu_mb_flag, cpu); -} -EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); - -char *rcupreempt_try_flip_state_name(void) -{ - return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; -} -EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); - -struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) -{ - struct rcu_data *rdp = RCU_DATA_CPU(cpu); - - return &rdp->trace; -} -EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); - -#endif /* #ifdef RCU_TRACE */ -/* - * Read-Copy Update tracing for realtime implementation - * - * This program is free software; you can redistribute it and/or 
modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2006 - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct mutex rcupreempt_trace_mutex; -static char *rcupreempt_trace_buf; -#define RCUPREEMPT_TRACE_BUF_SIZE 4096 - -void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) -{ - trace->done_length += trace->wait_length; - trace->done_add += trace->wait_length; - trace->wait_length = 0; -} -void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) -{ - trace->wait_length += trace->next_length; - trace->wait_add += trace->next_length; - trace->next_length = 0; -} -void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->rcu_try_flip_1); -} -void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->rcu_try_flip_e1); -} -void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_i1++; -} -void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ie1++; -} -void 
rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_g1++; -} -void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_a1++; -} -void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ae1++; -} -void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_a2++; -} -void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_z1++; -} -void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_ze1++; -} -void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_z2++; -} -void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_m1++; -} -void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_me1++; -} -void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) -{ - trace->rcu_try_flip_m2++; -} -void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) -{ - trace->rcu_check_callbacks++; -} -void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) -{ - trace->done_remove += trace->done_length; - trace->done_length = 0; -} -void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) -{ - atomic_inc(&trace->done_invoked); -} -void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) -{ - trace->next_add++; - trace->next_length++; -} - -static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) -{ - struct rcupreempt_trace *cp; - int cpu; - - memset(sp, 0, sizeof(*sp)); - for_each_possible_cpu(cpu) { - cp = rcupreempt_trace_cpu(cpu); - sp->next_length += cp->next_length; - sp->next_add += cp->next_add; - sp->wait_length += cp->wait_length; - sp->wait_add += cp->wait_add; - sp->done_length += cp->done_length; - sp->done_add += cp->done_add; - sp->done_remove += cp->done_remove; - atomic_set(&sp->done_invoked, 
atomic_read(&cp->done_invoked)); - sp->rcu_check_callbacks += cp->rcu_check_callbacks; - atomic_set(&sp->rcu_try_flip_1, - atomic_read(&cp->rcu_try_flip_1)); - atomic_set(&sp->rcu_try_flip_e1, - atomic_read(&cp->rcu_try_flip_e1)); - sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; - sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; - sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; - sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; - sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; - sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; - sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; - sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; - sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; - sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; - sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; - sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; - } -} - -static ssize_t rcustats_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct rcupreempt_trace trace; - ssize_t bcount; - int cnt = 0; - - rcupreempt_trace_sum(&trace); - mutex_lock(&rcupreempt_trace_mutex); - snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "ggp=%ld rcc=%ld\n", - rcu_batches_completed(), - trace.rcu_check_callbacks); - snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" - "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" - "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", - - trace.next_add, trace.next_length, - trace.wait_add, trace.wait_length, - trace.done_add, trace.done_length, - trace.done_remove, atomic_read(&trace.done_invoked), - atomic_read(&trace.rcu_try_flip_1), - atomic_read(&trace.rcu_try_flip_e1), - trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, - trace.rcu_try_flip_g1, - trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, - trace.rcu_try_flip_a2, - trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, - trace.rcu_try_flip_z2, - trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, - trace.rcu_try_flip_m2); - bcount = 
simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static ssize_t rcugp_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - long oldgp = rcu_batches_completed(); - ssize_t bcount; - - mutex_lock(&rcupreempt_trace_mutex); - synchronize_rcu(); - snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, - "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - int cnt = 0; - int cpu; - int f = rcu_batches_completed() & 0x1; - ssize_t bcount; - - mutex_lock(&rcupreempt_trace_mutex); - - cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, - "CPU last cur F M\n"); - for_each_online_cpu(cpu) { - long *flipctr = rcupreempt_flipctr(cpu); - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "%3d %4ld %3ld %d %d\n", - cpu, - flipctr[!f], - flipctr[f], - rcupreempt_flip_flag(cpu), - rcupreempt_mb_flag(cpu)); - } - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "ggp = %ld, state = %s\n", - rcu_batches_completed(), - rcupreempt_try_flip_state_name()); - cnt += snprintf(&rcupreempt_trace_buf[cnt], - RCUPREEMPT_TRACE_BUF_SIZE - cnt, - "\n"); - bcount = simple_read_from_buffer(buffer, count, ppos, - rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); - mutex_unlock(&rcupreempt_trace_mutex); - return bcount; -} - -static struct file_operations rcustats_fops = { - .owner = THIS_MODULE, - .read = rcustats_read, -}; - -static struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .read = rcugp_read, -}; - -static struct file_operations rcuctrs_fops = { - .owner = THIS_MODULE, - 
.read = rcuctrs_read, -}; - -static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; -static int rcupreempt_debugfs_init(void) -{ - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto out; - statdir = debugfs_create_file("rcustats", 0444, rcudir, - NULL, &rcustats_fops); - if (!statdir) - goto free_out; - - gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!gpdir) - goto free_out; - - ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, - NULL, &rcuctrs_fops); - if (!ctrsdir) - goto free_out; - return 0; -free_out: - if (statdir) - debugfs_remove(statdir); - if (gpdir) - debugfs_remove(gpdir); - debugfs_remove(rcudir); -out: - return 1; -} - -static int __init rcupreempt_trace_init(void) -{ - mutex_init(&rcupreempt_trace_mutex); - rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); - if (!rcupreempt_trace_buf) - return 1; - return rcupreempt_debugfs_init(); -} - -static void __exit rcupreempt_trace_cleanup(void) -{ - debugfs_remove(statdir); - debugfs_remove(gpdir); - debugfs_remove(ctrsdir); - debugfs_remove(rcudir); - kfree(rcupreempt_trace_buf); -} - - -module_init(rcupreempt_trace_init); -module_exit(rcupreempt_trace_cleanup); -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. McKenney - * Josh Triplett - * - * See also: Documentation/RCU/torture.txt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and " - "Josh Triplett "); - -static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ -static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval; /* Interval between stats, in seconds. */ - /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ -static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ -static int stutter = 5; /* Start/stop testing interval (in sec) */ -static int irqreader = 1; /* RCU readers from irq (timers). */ -static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ - -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0444); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static char printk_buf[4096]; - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static int fullstop = 0; /* stop generating callbacks at test end. 
*/ -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current = NULL; -static long rcu_torture_current_version = 0; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; -static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static long n_rcu_torture_timers = 0; -static struct list_head rcu_torture_removed; - -static int stutter_pause_test = 0; - -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; - -/* - * Allocate an element from the rcu_tortures pool. - */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. 
- */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). - */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(void) -{ - while (stutter_pause_test || !rcutorture_runnable) - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); -} - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_torture_ops { - void (*init)(void); - void (*cleanup)(void); - int (*readlock)(void); - void (*readdelay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferredfree)(struct rcu_torture *p); - void (*sync)(void); - void (*cb_barrier)(void); - int (*stats)(char *page); - int irqcapable; - char *name; -}; -static struct rcu_torture_ops *cur_ops = NULL; - -/* - * Definitions for rcu torture testing. 
- */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long longdelay = 200; - - /* We want there to be long-running readers, but not all the time. */ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); - if (!delay) - udelay(longdelay); -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else - cur_ops->deferredfree(rp); -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = rcu_barrier, - .stats = NULL, - .irqcapable = 1, - .name = "rcu" -}; - -static void rcu_sync_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - cur_ops->sync(); - list_add(&p->rtort_free, &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - 
atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_sync" -}; - -/* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -struct rcu_bh_torture_synchronize { - struct rcu_head head; - struct completion completion; -}; - -static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) -{ - struct rcu_bh_torture_synchronize *rcu; - - rcu = container_of(head, struct rcu_bh_torture_synchronize, head); - complete(&rcu->completion); -} - -static void rcu_bh_torture_synchronize(void) -{ - struct rcu_bh_torture_synchronize rcu; - - init_completion(&rcu.completion); - call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); - wait_for_completion(&rcu.completion); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = rcu_barrier_bh, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh" -}; - -static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh_sync" -}; - -/* - * Definitions for srcu torture testing. - */ - -static struct srcu_struct srcu_ctl; - -static void srcu_torture_init(void) -{ - init_srcu_struct(&srcu_ctl); - rcu_sync_torture_init(); -} - -static void srcu_torture_cleanup(void) -{ - synchronize_srcu(&srcu_ctl); - cleanup_srcu_struct(&srcu_ctl); -} - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. 
*/ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static int srcu_torture_stats(char *page) -{ - int cnt = 0; - int cpu; - int idx = srcu_ctl.completed & 0x1; - - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - cnt += sprintf(&page[cnt], "\n"); - return cnt; -} - -static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .readdelay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static int sched_torture_completed(void) -{ - return 0; -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static void sched_torture_synchronize(void) -{ - synchronize_sched(); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = rcu_barrier_sched, - .stats = NULL, - .irqcapable = 1, - .name = "sched" -}; - -static struct rcu_torture_ops sched_ops_sync = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .name = "sched_sync" -}; - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). - */ -static int -rcu_torture_writer(void *arg) -{ - int i; - long oldbatch = rcu_batches_completed(); - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - if ((rp = rcu_torture_alloc()) == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - cur_ops->deferredfree(old_rp); - } - rcu_torture_current_version++; - oldbatch = cur_ops->completed(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU 
torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. - */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static void rcu_torture_timer(unsigned long unused) -{ - int idx; - int completed; - static DEFINE_RCU_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); - struct rcu_torture *p; - int pipe_count; - - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->readdelay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; - completed = cur_ops->completed() - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... 
*/ - completed = RCU_TORTURE_PIPE_LEN; - } - ++__get_cpu_var(rcu_torture_batch)[completed]; - preempt_enable(); - cur_ops->readunlock(idx); -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - struct timer_list t; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - if (irqreader && cur_ops->irqcapable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); - - do { - if (irqreader && cur_ops->irqcapable) { - if (!timer_pending(&t)) - mod_timer(&t, 1); - } - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->readdelay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; - completed = cur_ops->completed() - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... 
*/ - completed = RCU_TORTURE_PIPE_LEN; - } - ++__get_cpu_var(rcu_torture_batch)[completed]; - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - if (irqreader && cur_ops->irqcapable) - del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. - */ -static int -rcu_torture_printk(char *page) -{ - int cnt = 0; - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], - "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d nt: %ld", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_timers); - if (atomic_read(&n_rcu_torture_mberror) != 0) - cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (i > 1) { - cnt += sprintf(&page[cnt], "!!! 
"); - atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); - } - cnt += sprintf(&page[cnt], "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", - atomic_read(&rcu_torture_wcount[i])); - } - cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int cnt; - - cnt = rcu_torture_printk(printk_buf); - printk(KERN_ALERT "%s", printk_buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. 
- */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - cpumask_t tmp_mask; - int i; - - cpus_setall(tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpu_clear(rcu_idle_cpu, tmp_mask); - - set_cpus_allowed_ptr(current, &tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - &tmp_mask); - } - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - &tmp_mask); - } - - if (writer_task) - set_cpus_allowed_ptr(writer_task, &tmp_mask); - - if (stats_task) - set_cpus_allowed_ptr(stats_task, &tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. 
- */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(char *tag) -{ - printk(KERN_ALERT "%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader); -} - -static void -rcu_torture_cleanup(void) -{ - int i; - - fullstop = 1; - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - 
rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - /* Wait for all RCU callbacks to fire. */ - - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (cur_ops->cleanup) - cur_ops->cleanup(); - if (atomic_read(&n_rcu_torture_error)) - rcu_torture_print_module_parms("End of test: FAILURE"); - else - rcu_torture_print_module_parms("End of test: SUCCESS"); -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &srcu_ops, &sched_ops, &sched_ops_sync, }; - - /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", - torture_type); - return (-EINVAL); - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms("Start of test"); - fullstop = 0; - - /* Set up the freelist. 
*/ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. */ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_run(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - 
VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - if (stutter < 0) - stutter = 0; - if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - goto unwind; - } - } - return 0; - -unwind: - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); -/* - * Public API and common code for kernel->userspace relay file support. - * - * See Documentation/filesystems/relay.txt for an overview. - * - * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp - * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) - * - * Moved to kernel/relay.c by Paul Mundt, 2006. - * November 2006 - CPU hotplug support by Mathieu Desnoyers - * (mathieu.desnoyers@polymtl.ca) - * - * This file is released under the GPL. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* list of open channels, for cpu hotplug */ -static DEFINE_MUTEX(relay_channels_mutex); -static LIST_HEAD(relay_channels); - -/* - * close() vm_op implementation for relay file mapping. - */ -static void relay_file_mmap_close(struct vm_area_struct *vma) -{ - struct rchan_buf *buf = vma->vm_private_data; - buf->chan->cb->buf_unmapped(buf, vma->vm_file); -} - -/* - * fault() vm_op implementation for relay file mapping. - */ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page; - struct rchan_buf *buf = vma->vm_private_data; - pgoff_t pgoff = vmf->pgoff; - - if (!buf) - return VM_FAULT_OOM; - - page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT)); - if (!page) - return VM_FAULT_SIGBUS; - get_page(page); - vmf->page = page; - - return 0; -} - -/* - * vm_ops for relay file mappings. - */ -static struct vm_operations_struct relay_file_mmap_ops = { - .fault = relay_buf_fault, - .close = relay_file_mmap_close, -}; - -/* - * allocate an array of pointers of struct page - */ -static struct page **relay_alloc_page_array(unsigned int n_pages) -{ - struct page **array; - size_t pa_size = n_pages * sizeof(struct page *); - - if (pa_size > PAGE_SIZE) { - array = vmalloc(pa_size); - if (array) - memset(array, 0, pa_size); - } else { - array = kzalloc(pa_size, GFP_KERNEL); - } - return array; -} - -/* - * free an array of pointers of struct page - */ -static void relay_free_page_array(struct page **array) -{ - if (is_vmalloc_addr(array)) - vfree(array); - else - kfree(array); -} - -/** - * relay_mmap_buf: - mmap channel buffer to process address space - * @buf: relay channel buffer - * @vma: vm_area_struct describing memory to be mapped - * - * Returns 0 if ok, negative on error - * - * Caller should already have grabbed mmap_sem. 
- */ -static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) -{ - unsigned long length = vma->vm_end - vma->vm_start; - struct file *filp = vma->vm_file; - - if (!buf) - return -EBADF; - - if (length != (unsigned long)buf->chan->alloc_size) - return -EINVAL; - - vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; - vma->vm_private_data = buf; - buf->chan->cb->buf_mapped(buf, filp); - - return 0; -} - -/** - * relay_alloc_buf - allocate a channel buffer - * @buf: the buffer struct - * @size: total size of the buffer - * - * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The - * passed in size will get page aligned, if it isn't already. - */ -static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) -{ - void *mem; - unsigned int i, j, n_pages; - - *size = PAGE_ALIGN(*size); - n_pages = *size >> PAGE_SHIFT; - - buf->page_array = relay_alloc_page_array(n_pages); - if (!buf->page_array) - return NULL; - - for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); - if (unlikely(!buf->page_array[i])) - goto depopulate; - set_page_private(buf->page_array[i], (unsigned long)buf); - } - mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); - if (!mem) - goto depopulate; - - memset(mem, 0, *size); - buf->page_count = n_pages; - return mem; - -depopulate: - for (j = 0; j < i; j++) - __free_page(buf->page_array[j]); - relay_free_page_array(buf->page_array); - return NULL; -} - -/** - * relay_create_buf - allocate and initialize a channel buffer - * @chan: the relay channel - * - * Returns channel buffer if successful, %NULL otherwise. 
- */ -static struct rchan_buf *relay_create_buf(struct rchan *chan) -{ - struct rchan_buf *buf; - - if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) - return NULL; - - buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) - return NULL; - buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); - if (!buf->padding) - goto free_buf; - - buf->start = relay_alloc_buf(buf, &chan->alloc_size); - if (!buf->start) - goto free_buf; - - buf->chan = chan; - kref_get(&buf->chan->kref); - return buf; - -free_buf: - kfree(buf->padding); - kfree(buf); - return NULL; -} - -/** - * relay_destroy_channel - free the channel struct - * @kref: target kernel reference that contains the relay channel - * - * Should only be called from kref_put(). - */ -static void relay_destroy_channel(struct kref *kref) -{ - struct rchan *chan = container_of(kref, struct rchan, kref); - kfree(chan); -} - -/** - * relay_destroy_buf - destroy an rchan_buf struct and associated buffer - * @buf: the buffer struct - */ -static void relay_destroy_buf(struct rchan_buf *buf) -{ - struct rchan *chan = buf->chan; - unsigned int i; - - if (likely(buf->start)) { - vunmap(buf->start); - for (i = 0; i < buf->page_count; i++) - __free_page(buf->page_array[i]); - relay_free_page_array(buf->page_array); - } - chan->buf[buf->cpu] = NULL; - kfree(buf->padding); - kfree(buf); - kref_put(&chan->kref, relay_destroy_channel); -} - -/** - * relay_remove_buf - remove a channel buffer - * @kref: target kernel reference that contains the relay buffer - * - * Removes the file from the fileystem, which also frees the - * rchan_buf_struct and the channel buffer. Should only be called from - * kref_put(). - */ -static void relay_remove_buf(struct kref *kref) -{ - struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - buf->chan->cb->remove_buf_file(buf->dentry); - relay_destroy_buf(buf); -} - -/** - * relay_buf_empty - boolean, is the channel buffer empty? 
- * @buf: channel buffer - * - * Returns 1 if the buffer is empty, 0 otherwise. - */ -static int relay_buf_empty(struct rchan_buf *buf) -{ - return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; -} - -/** - * relay_buf_full - boolean, is the channel buffer full? - * @buf: channel buffer - * - * Returns 1 if the buffer is full, 0 otherwise. - */ -int relay_buf_full(struct rchan_buf *buf) -{ - size_t ready = buf->subbufs_produced - buf->subbufs_consumed; - return (ready >= buf->chan->n_subbufs) ? 1 : 0; -} -EXPORT_SYMBOL_GPL(relay_buf_full); - -/* - * High-level relay kernel API and associated functions. - */ - -/* - * rchan_callback implementations defining default channel behavior. Used - * in place of corresponding NULL values in client callback struct. - */ - -/* - * subbuf_start() default callback. Does nothing. - */ -static int subbuf_start_default_callback (struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) -{ - if (relay_buf_full(buf)) - return 0; - - return 1; -} - -/* - * buf_mapped() default callback. Does nothing. - */ -static void buf_mapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - -/* - * buf_unmapped() default callback. Does nothing. - */ -static void buf_unmapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - -/* - * create_buf_file_create() default callback. Does nothing. - */ -static struct dentry *create_buf_file_default_callback(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return NULL; -} - -/* - * remove_buf_file() default callback. Does nothing. 
- */ -static int remove_buf_file_default_callback(struct dentry *dentry) -{ - return -EINVAL; -} - -/* relay channel default callbacks */ -static struct rchan_callbacks default_channel_callbacks = { - .subbuf_start = subbuf_start_default_callback, - .buf_mapped = buf_mapped_default_callback, - .buf_unmapped = buf_unmapped_default_callback, - .create_buf_file = create_buf_file_default_callback, - .remove_buf_file = remove_buf_file_default_callback, -}; - -/** - * wakeup_readers - wake up readers waiting on a channel - * @data: contains the channel buffer - * - * This is the timer function used to defer reader waking. - */ -static void wakeup_readers(unsigned long data) -{ - struct rchan_buf *buf = (struct rchan_buf *)data; - wake_up_interruptible(&buf->read_wait); -} - -/** - * __relay_reset - reset a channel buffer - * @buf: the channel buffer - * @init: 1 if this is a first-time initialization - * - * See relay_reset() for description of effect. - */ -static void __relay_reset(struct rchan_buf *buf, unsigned int init) -{ - size_t i; - - if (init) { - init_waitqueue_head(&buf->read_wait); - kref_init(&buf->kref); - setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); - } else - del_timer_sync(&buf->timer); - - buf->subbufs_produced = 0; - buf->subbufs_consumed = 0; - buf->bytes_consumed = 0; - buf->finalized = 0; - buf->data = buf->start; - buf->offset = 0; - - for (i = 0; i < buf->chan->n_subbufs; i++) - buf->padding[i] = 0; - - buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); -} - -/** - * relay_reset - reset the channel - * @chan: the channel - * - * This has the effect of erasing all data from all channel buffers - * and restarting the channel in its initial state. The buffers - * are not freed, so any mappings are still in effect. - * - * NOTE. Care should be taken that the channel isn't actually - * being used by anything when this call is made. 
- */ -void relay_reset(struct rchan *chan) -{ - unsigned int i; - - if (!chan) - return; - - if (chan->is_global && chan->buf[0]) { - __relay_reset(chan->buf[0], 0); - return; - } - - mutex_lock(&relay_channels_mutex); - for_each_online_cpu(i) - if (chan->buf[i]) - __relay_reset(chan->buf[i], 0); - mutex_unlock(&relay_channels_mutex); -} -EXPORT_SYMBOL_GPL(relay_reset); - -static inline void relay_set_buf_dentry(struct rchan_buf *buf, - struct dentry *dentry) -{ - buf->dentry = dentry; - buf->dentry->d_inode->i_size = buf->early_bytes; -} - -static struct dentry *relay_create_buf_file(struct rchan *chan, - struct rchan_buf *buf, - unsigned int cpu) -{ - struct dentry *dentry; - char *tmpname; - - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - return NULL; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); - - /* Create file in fs */ - dentry = chan->cb->create_buf_file(tmpname, chan->parent, - S_IRUSR, buf, - &chan->is_global); - - kfree(tmpname); - - return dentry; -} - -/* - * relay_open_buf - create a new relay channel buffer - * - * used by relay_open() and CPU hotplug. - */ -static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) -{ - struct rchan_buf *buf = NULL; - struct dentry *dentry; - - if (chan->is_global) - return chan->buf[0]; - - buf = relay_create_buf(chan); - if (!buf) - return NULL; - - if (chan->has_base_filename) { - dentry = relay_create_buf_file(chan, buf, cpu); - if (!dentry) - goto free_buf; - relay_set_buf_dentry(buf, dentry); - } - - buf->cpu = cpu; - __relay_reset(buf, 1); - - if(chan->is_global) { - chan->buf[0] = buf; - buf->cpu = 0; - } - - return buf; - -free_buf: - relay_destroy_buf(buf); - return NULL; -} - -/** - * relay_close_buf - close a channel buffer - * @buf: channel buffer - * - * Marks the buffer finalized and restores the default callbacks. - * The channel buffer and channel buffer data structure are then freed - * automatically when the last reference is given up. 
- */ -static void relay_close_buf(struct rchan_buf *buf) -{ - buf->finalized = 1; - del_timer_sync(&buf->timer); - kref_put(&buf->kref, relay_remove_buf); -} - -static void setup_callbacks(struct rchan *chan, - struct rchan_callbacks *cb) -{ - if (!cb) { - chan->cb = &default_channel_callbacks; - return; - } - - if (!cb->subbuf_start) - cb->subbuf_start = subbuf_start_default_callback; - if (!cb->buf_mapped) - cb->buf_mapped = buf_mapped_default_callback; - if (!cb->buf_unmapped) - cb->buf_unmapped = buf_unmapped_default_callback; - if (!cb->create_buf_file) - cb->create_buf_file = create_buf_file_default_callback; - if (!cb->remove_buf_file) - cb->remove_buf_file = remove_buf_file_default_callback; - chan->cb = cb; -} - -/** - * relay_hotcpu_callback - CPU hotplug callback - * @nb: notifier block - * @action: hotplug action to take - * @hcpu: CPU number - * - * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) - */ -static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) -{ - unsigned int hotcpu = (unsigned long)hcpu; - struct rchan *chan; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&relay_channels_mutex); - list_for_each_entry(chan, &relay_channels, list) { - if (chan->buf[hotcpu]) - continue; - chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); - if(!chan->buf[hotcpu]) { - printk(KERN_ERR - "relay_hotcpu_callback: cpu %d buffer " - "creation failed\n", hotcpu); - mutex_unlock(&relay_channels_mutex); - return NOTIFY_BAD; - } - } - mutex_unlock(&relay_channels_mutex); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* No need to flush the cpu : will be flushed upon - * final relay_flush() call. 
*/ - break; - } - return NOTIFY_OK; -} - -/** - * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only - * @parent: dentry of parent directory, %NULL for root directory or buffer - * @subbuf_size: size of sub-buffers - * @n_subbufs: number of sub-buffers - * @cb: client callback functions - * @private_data: user-defined data - * - * Returns channel pointer if successful, %NULL otherwise. - * - * Creates a channel buffer for each cpu using the sizes and - * attributes specified. The created channel buffer files - * will be named base_filename0...base_filenameN-1. File - * permissions will be %S_IRUSR. - */ -struct rchan *relay_open(const char *base_filename, - struct dentry *parent, - size_t subbuf_size, - size_t n_subbufs, - struct rchan_callbacks *cb, - void *private_data) -{ - unsigned int i; - struct rchan *chan; - - if (!(subbuf_size && n_subbufs)) - return NULL; - if (subbuf_size > UINT_MAX / n_subbufs) - return NULL; - - chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); - if (!chan) - return NULL; - - chan->version = RELAYFS_CHANNEL_VERSION; - chan->n_subbufs = n_subbufs; - chan->subbuf_size = subbuf_size; - chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); - chan->parent = parent; - chan->private_data = private_data; - if (base_filename) { - chan->has_base_filename = 1; - strlcpy(chan->base_filename, base_filename, NAME_MAX); - } - setup_callbacks(chan, cb); - kref_init(&chan->kref); - - mutex_lock(&relay_channels_mutex); - for_each_online_cpu(i) { - chan->buf[i] = relay_open_buf(chan, i); - if (!chan->buf[i]) - goto free_bufs; - } - list_add(&chan->list, &relay_channels); - mutex_unlock(&relay_channels_mutex); - - return chan; - -free_bufs: - for_each_online_cpu(i) { - if (!chan->buf[i]) - break; - relay_close_buf(chan->buf[i]); - } - - kref_put(&chan->kref, relay_destroy_channel); - mutex_unlock(&relay_channels_mutex); - return NULL; -} -EXPORT_SYMBOL_GPL(relay_open); - -struct 
rchan_percpu_buf_dispatcher { - struct rchan_buf *buf; - struct dentry *dentry; -}; - -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel. - * Useful to do early tracing in kernel, before VFS is up, for example. - */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strlcpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. 
- */ - for_each_online_cpu(i) { - if (unlikely(!chan->buf[i])) { - printk(KERN_ERR "relay_late_setup_files: CPU %u " - "has no buffer, it must have!\n", i); - BUG(); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, chan->buf[i], i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(chan->buf[i], dentry); - local_irq_restore(flags); - } else { - disp.buf = chan->buf[i]; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. */ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} - -/** - * relay_switch_subbuf - switch to a new sub-buffer - * @buf: channel buffer - * @length: size of current event - * - * Returns either the length passed in or 0 if full. - * - * Performs sub-buffer-switch tasks such as invoking callbacks, - * updating padding counts, waking up readers, etc. - */ -size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) -{ - void *old, *new; - size_t old_subbuf, new_subbuf; - - if (unlikely(length > buf->chan->subbuf_size)) - goto toobig; - - if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; - old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; - buf->subbufs_produced++; - if (buf->dentry) - buf->dentry->d_inode->i_size += - buf->chan->subbuf_size - - buf->padding[old_subbuf]; - else - buf->early_bytes += buf->chan->subbuf_size - - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. 
- */ - __mod_timer(&buf->timer, jiffies + 1); - } - - old = buf->data; - new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - new = buf->start + new_subbuf * buf->chan->subbuf_size; - buf->offset = 0; - if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { - buf->offset = buf->chan->subbuf_size + 1; - return 0; - } - buf->data = new; - buf->padding[new_subbuf] = 0; - - if (unlikely(length + buf->offset > buf->chan->subbuf_size)) - goto toobig; - - return length; - -toobig: - buf->chan->last_toobig = length; - return 0; -} -EXPORT_SYMBOL_GPL(relay_switch_subbuf); - -/** - * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count - * @chan: the channel - * @cpu: the cpu associated with the channel buffer to update - * @subbufs_consumed: number of sub-buffers to add to current buf's count - * - * Adds to the channel buffer's consumed sub-buffer count. - * subbufs_consumed should be the number of sub-buffers newly consumed, - * not the total consumed. - * - * NOTE. Kernel clients don't need to call this function if the channel - * mode is 'overwrite'. - */ -void relay_subbufs_consumed(struct rchan *chan, - unsigned int cpu, - size_t subbufs_consumed) -{ - struct rchan_buf *buf; - - if (!chan) - return; - - if (cpu >= NR_CPUS || !chan->buf[cpu]) - return; - - buf = chan->buf[cpu]; - buf->subbufs_consumed += subbufs_consumed; - if (buf->subbufs_consumed > buf->subbufs_produced) - buf->subbufs_consumed = buf->subbufs_produced; -} -EXPORT_SYMBOL_GPL(relay_subbufs_consumed); - -/** - * relay_close - close the channel - * @chan: the channel - * - * Closes all channel buffers and frees the channel. 
- */ -void relay_close(struct rchan *chan) -{ - unsigned int i; - - if (!chan) - return; - - mutex_lock(&relay_channels_mutex); - if (chan->is_global && chan->buf[0]) - relay_close_buf(chan->buf[0]); - else - for_each_possible_cpu(i) - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); - - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%Zd) > sub-buffer size (%Zd)]\n", - chan->last_toobig, chan->subbuf_size); - - list_del(&chan->list); - kref_put(&chan->kref, relay_destroy_channel); - mutex_unlock(&relay_channels_mutex); -} -EXPORT_SYMBOL_GPL(relay_close); - -/** - * relay_flush - close the channel - * @chan: the channel - * - * Flushes all channel buffers, i.e. forces buffer switch. - */ -void relay_flush(struct rchan *chan) -{ - unsigned int i; - - if (!chan) - return; - - if (chan->is_global && chan->buf[0]) { - relay_switch_subbuf(chan->buf[0], 0); - return; - } - - mutex_lock(&relay_channels_mutex); - for_each_possible_cpu(i) - if (chan->buf[i]) - relay_switch_subbuf(chan->buf[i], 0); - mutex_unlock(&relay_channels_mutex); -} -EXPORT_SYMBOL_GPL(relay_flush); - -/** - * relay_file_open - open file op for relay files - * @inode: the inode - * @filp: the file - * - * Increments the channel buffer refcount. - */ -static int relay_file_open(struct inode *inode, struct file *filp) -{ - struct rchan_buf *buf = inode->i_private; - kref_get(&buf->kref); - filp->private_data = buf; - - return nonseekable_open(inode, filp); -} - -/** - * relay_file_mmap - mmap file op for relay files - * @filp: the file - * @vma: the vma describing what to map - * - * Calls upon relay_mmap_buf() to map the file into user space. - */ -static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct rchan_buf *buf = filp->private_data; - return relay_mmap_buf(buf, vma); -} - -/** - * relay_file_poll - poll file op for relay files - * @filp: the file - * @wait: poll table - * - * Poll implemention. 
- */ -static unsigned int relay_file_poll(struct file *filp, poll_table *wait) -{ - unsigned int mask = 0; - struct rchan_buf *buf = filp->private_data; - - if (buf->finalized) - return POLLERR; - - if (filp->f_mode & FMODE_READ) { - poll_wait(filp, &buf->read_wait, wait); - if (!relay_buf_empty(buf)) - mask |= POLLIN | POLLRDNORM; - } - - return mask; -} - -/** - * relay_file_release - release file op for relay files - * @inode: the inode - * @filp: the file - * - * Decrements the channel refcount, as the filesystem is - * no longer using it. - */ -static int relay_file_release(struct inode *inode, struct file *filp) -{ - struct rchan_buf *buf = filp->private_data; - kref_put(&buf->kref, relay_remove_buf); - - return 0; -} - -/* - * relay_file_read_consume - update the consumed count for the buffer - */ -static void relay_file_read_consume(struct rchan_buf *buf, - size_t read_pos, - size_t bytes_consumed) -{ - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - size_t read_subbuf; - - if (buf->subbufs_produced == buf->subbufs_consumed && - buf->offset == buf->bytes_consumed) - return; - - if (buf->bytes_consumed + bytes_consumed > subbuf_size) { - relay_subbufs_consumed(buf->chan, buf->cpu, 1); - buf->bytes_consumed = 0; - } - - buf->bytes_consumed += bytes_consumed; - if (!read_pos) - read_subbuf = buf->subbufs_consumed % n_subbufs; - else - read_subbuf = read_pos / buf->chan->subbuf_size; - if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { - if ((read_subbuf == buf->subbufs_produced % n_subbufs) && - (buf->offset == subbuf_size)) - return; - relay_subbufs_consumed(buf->chan, buf->cpu, 1); - buf->bytes_consumed = 0; - } -} - -/* - * relay_file_read_avail - boolean, are there unconsumed bytes available? 
- */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) -{ - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; - - relay_file_read_consume(buf, read_pos, 0); - - consumed = buf->subbufs_consumed; - - if (unlikely(buf->offset > subbuf_size)) { - if (produced == consumed) - return 0; - return 1; - } - - if (unlikely(produced - consumed >= n_subbufs)) { - consumed = produced - n_subbufs + 1; - buf->subbufs_consumed = consumed; - buf->bytes_consumed = 0; - } - - produced = (produced % n_subbufs) * subbuf_size + buf->offset; - consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; - - if (consumed > produced) - produced += n_subbufs * subbuf_size; - - if (consumed == produced) { - if (buf->offset == subbuf_size && - buf->subbufs_produced > buf->subbufs_consumed) - return 1; - return 0; - } - - return 1; -} - -/** - * relay_file_read_subbuf_avail - return bytes available in sub-buffer - * @read_pos: file read position - * @buf: relay channel buffer - */ -static size_t relay_file_read_subbuf_avail(size_t read_pos, - struct rchan_buf *buf) -{ - size_t padding, avail = 0; - size_t read_subbuf, read_offset, write_subbuf, write_offset; - size_t subbuf_size = buf->chan->subbuf_size; - - write_subbuf = (buf->data - buf->start) / subbuf_size; - write_offset = buf->offset > subbuf_size ? 
subbuf_size : buf->offset; - read_subbuf = read_pos / subbuf_size; - read_offset = read_pos % subbuf_size; - padding = buf->padding[read_subbuf]; - - if (read_subbuf == write_subbuf) { - if (read_offset + padding < write_offset) - avail = write_offset - (read_offset + padding); - } else - avail = (subbuf_size - padding) - read_offset; - - return avail; -} - -/** - * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position - * @buf: relay channel buffer - * - * If the @read_pos is in the middle of padding, return the - * position of the first actually available byte, otherwise - * return the original value. - */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) -{ - size_t read_subbuf, padding, padding_start, padding_end; - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - size_t consumed = buf->subbufs_consumed % n_subbufs; - - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; - read_subbuf = read_pos / subbuf_size; - padding = buf->padding[read_subbuf]; - padding_start = (read_subbuf + 1) * subbuf_size - padding; - padding_end = (read_subbuf + 1) * subbuf_size; - if (read_pos >= padding_start && read_pos < padding_end) { - read_subbuf = (read_subbuf + 1) % n_subbufs; - read_pos = read_subbuf * subbuf_size; - } - - return read_pos; -} - -/** - * relay_file_read_end_pos - return the new read position - * @read_pos: file read position - * @buf: relay channel buffer - * @count: number of bytes to be read - */ -static size_t relay_file_read_end_pos(struct rchan_buf *buf, - size_t read_pos, - size_t count) -{ - size_t read_subbuf, padding, end_pos; - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - - read_subbuf = read_pos / subbuf_size; - padding = buf->padding[read_subbuf]; - if (read_pos % subbuf_size + count + padding == subbuf_size) - end_pos = (read_subbuf + 1) * subbuf_size; - else - 
end_pos = read_pos + count; - if (end_pos >= subbuf_size * n_subbufs) - end_pos = 0; - - return end_pos; -} - -/* - * subbuf_read_actor - read up to one subbuf's worth of data - */ -static int subbuf_read_actor(size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc, - read_actor_t actor) -{ - void *from; - int ret = 0; - - from = buf->start + read_start; - ret = avail; - if (copy_to_user(desc->arg.buf, from, avail)) { - desc->error = -EFAULT; - ret = 0; - } - desc->arg.data += ret; - desc->written += ret; - desc->count -= ret; - - return ret; -} - -typedef int (*subbuf_actor_t) (size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc, - read_actor_t actor); - -/* - * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries - */ -static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_actor_t actor, - read_descriptor_t *desc) -{ - struct rchan_buf *buf = filp->private_data; - size_t read_start, avail; - int ret; - - if (!desc->count) - return 0; - - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); - do { - if (!relay_file_read_avail(buf, *ppos)) - break; - - read_start = relay_file_read_start_pos(*ppos, buf); - avail = relay_file_read_subbuf_avail(read_start, buf); - if (!avail) - break; - - avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc, actor); - if (desc->error < 0) - break; - - if (ret) { - relay_file_read_consume(buf, read_start, ret); - *ppos = relay_file_read_end_pos(buf, read_start, ret); - } - } while (desc->count && ret); - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); - - return desc->written; -} - -static ssize_t relay_file_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) -{ - read_descriptor_t desc; - desc.written = 0; - desc.count = count; - desc.arg.buf = buffer; - desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, - 
NULL, &desc); -} - -static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) -{ - rbuf->bytes_consumed += bytes_consumed; - - if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { - relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); - rbuf->bytes_consumed %= rbuf->chan->subbuf_size; - } -} - -static void relay_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct rchan_buf *rbuf; - - rbuf = (struct rchan_buf *)page_private(buf->page); - relay_consume_bytes(rbuf, buf->private); -} - -static struct pipe_buf_operations relay_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - -static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -{ -} - -/* - * subbuf_splice_actor - splice up to one subbuf's worth of data - */ -static int subbuf_splice_actor(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags, - int *nonpad_ret) -{ - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; - struct rchan_buf *rbuf = in->private_data; - unsigned int subbuf_size = rbuf->chan->subbuf_size; - uint64_t pos = (uint64_t) *ppos; - uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; - size_t read_start = (size_t) do_div(pos, alloc_size); - size_t read_subbuf = read_start / subbuf_size; - size_t padding = rbuf->padding[read_subbuf]; - size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .nr_pages = 0, - .partial = partial, - .flags = flags, - .ops = &relay_pipe_buf_ops, - .spd_release = relay_page_release, - }; - - if (rbuf->subbufs_produced == rbuf->subbufs_consumed) - return 0; - - /* - * Adjust read len, if longer 
than what is available - */ - if (len > (subbuf_size - read_start % subbuf_size)) - len = subbuf_size - read_start % subbuf_size; - - subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; - pidx = (read_start / PAGE_SIZE) % subbuf_pages; - poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); - - for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { - unsigned int this_len, this_end, private; - unsigned int cur_pos = read_start + total_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - poff); - private = this_len; - - spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; - spd.partial[spd.nr_pages].offset = poff; - - this_end = cur_pos + this_len; - if (this_end >= nonpad_end) { - this_len = nonpad_end - cur_pos; - private = this_len + padding; - } - spd.partial[spd.nr_pages].len = this_len; - spd.partial[spd.nr_pages].private = private; - - len -= this_len; - total_len += this_len; - poff = 0; - pidx = (pidx + 1) % subbuf_pages; - - if (this_end >= nonpad_end) { - spd.nr_pages++; - break; - } - } - - if (!spd.nr_pages) - return 0; - - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); - if (ret < 0 || ret < total_len) - return ret; - - if (read_start + ret == nonpad_end) - ret += padding; - - return ret; -} - -static ssize_t relay_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - ssize_t spliced; - int ret; - int nonpad_ret = 0; - - ret = 0; - spliced = 0; - - while (len && !spliced) { - ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); - if (ret < 0) - break; - else if (!ret) { - if (spliced) - break; - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } - } - - *ppos += ret; - if (ret > len) - len = 0; - else - len -= ret; - spliced += nonpad_ret; - nonpad_ret = 0; - } - - if (spliced) - return spliced; - - return ret; -} - -const struct file_operations relay_file_operations = { - .open = 
relay_file_open, - .poll = relay_file_poll, - .mmap = relay_file_mmap, - .read = relay_file_read, - .llseek = no_llseek, - .release = relay_file_release, - .splice_read = relay_file_splice_read, -}; -EXPORT_SYMBOL_GPL(relay_file_operations); - -static __init int relay_init(void) -{ - - hotcpu_notifier(relay_hotcpu_callback, 0); - return 0; -} - -early_initcall(relay_init); -/* - * resource cgroups - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov - * - */ - -#include -#include -#include -#include -#include -#include -#include - -void res_counter_init(struct res_counter *counter) -{ - spin_lock_init(&counter->lock); - counter->limit = (unsigned long long)LLONG_MAX; -} - -int res_counter_charge_locked(struct res_counter *counter, unsigned long val) -{ - if (counter->usage + val > counter->limit) { - counter->failcnt++; - return -ENOMEM; - } - - counter->usage += val; - if (counter->usage > counter->max_usage) - counter->max_usage = counter->usage; - return 0; -} - -int res_counter_charge(struct res_counter *counter, unsigned long val) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - ret = res_counter_charge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); - return ret; -} - -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; -} - -void res_counter_uncharge(struct res_counter *counter, unsigned long val) -{ - unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - res_counter_uncharge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); -} - - -static inline unsigned long long * -res_counter_member(struct res_counter *counter, int member) -{ - switch (member) { - case RES_USAGE: - return &counter->usage; - case RES_MAX_USAGE: - return &counter->max_usage; - case RES_LIMIT: - return &counter->limit; - case RES_FAILCNT: - return 
&counter->failcnt; - }; - - BUG(); - return NULL; -} - -ssize_t res_counter_read(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *st_buf)) -{ - unsigned long long *val; - char buf[64], *s; - - s = buf; - val = res_counter_member(counter, member); - if (read_strategy) - s += read_strategy(*val, s); - else - s += sprintf(s, "%llu\n", *val); - return simple_read_from_buffer((void __user *)userbuf, nbytes, - pos, buf, s - buf); -} - -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - return *res_counter_member(counter, member); -} - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *res) -{ - char *end; - /* FIXME - make memparse() take const char* args */ - *res = memparse((char *)buf, &end); - if (*end != '\0') - return -EINVAL; - - *res = PAGE_ALIGN(*res); - return 0; -} - -int res_counter_write(struct res_counter *counter, int member, - const char *buf, write_strategy_fn write_strategy) -{ - char *end; - unsigned long flags; - unsigned long long tmp, *val; - - if (write_strategy) { - if (write_strategy(buf, &tmp)) - return -EINVAL; - } else { - tmp = simple_strtoull(buf, &end, 10); - if (*end != '\0') - return -EINVAL; - } - spin_lock_irqsave(&counter->lock, flags); - val = res_counter_member(counter, member); - *val = tmp; - spin_unlock_irqrestore(&counter->lock, flags); - return 0; -} -/* - * linux/kernel/resource.c - * - * Copyright (C) 1999 Linus Torvalds - * Copyright (C) 1999 Martin Mares - * - * Arbitrary resource management. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -struct resource ioport_resource = { - .name = "PCI IO", - .start = 0, - .end = IO_SPACE_LIMIT, - .flags = IORESOURCE_IO, -}; -EXPORT_SYMBOL(ioport_resource); - -struct resource iomem_resource = { - .name = "PCI mem", - .start = 0, - .end = -1, - .flags = IORESOURCE_MEM, -}; -EXPORT_SYMBOL(iomem_resource); - -static DEFINE_RWLOCK(resource_lock); - -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct resource *p = v; - (*pos)++; - if (p->child) - return p->child; - while (!p->sibling && p->parent) - p = p->parent; - return p->sibling; -} - -static void *r_start(struct seq_file *m, loff_t *pos) - __acquires(resource_lock) -{ - struct resource *p = m->private; - loff_t l = 0; - read_lock(&resource_lock); - for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) - ; - return p; -} - -static void r_stop(struct seq_file *m, void *v) - __releases(resource_lock) -{ - read_unlock(&resource_lock); -} - -static int r_show(struct seq_file *m, void *v) -{ - struct resource *root = m->private; - struct resource *r = v, *p; - int width = root->end < 0x10000 ? 4 : 8; - int depth; - - for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) - if (p->parent == root) - break; - seq_printf(m, "%*s%0*llx-%0*llx : %s\n", - depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, - r->name ? 
r->name : ""); - return 0; -} - -static const struct seq_operations resource_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = r_show, -}; - -static int ioports_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &ioport_resource; - } - return res; -} - -static int iomem_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &iomem_resource; - } - return res; -} - -static const struct file_operations proc_ioports_operations = { - .open = ioports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations proc_iomem_operations = { - .open = iomem_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init ioresources_init(void) -{ - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); - return 0; -} -__initcall(ioresources_init); - -#endif /* CONFIG_PROC_FS */ - -/* Return the conflict entry if you can't request it */ -static struct resource * __request_resource(struct resource *root, struct resource *new) -{ - resource_size_t start = new->start; - resource_size_t end = new->end; - struct resource *tmp, **p; - - if (end < start) - return root; - if (start < root->start) - return root; - if (end > root->end) - return root; - p = &root->child; - for (;;) { - tmp = *p; - if (!tmp || tmp->start > end) { - new->sibling = tmp; - *p = new; - new->parent = root; - return NULL; - } - p = &tmp->sibling; - if (tmp->end < start) - continue; - return tmp; - } -} - -static int __release_resource(struct resource *old) -{ - struct resource *tmp, **p; - - p = &old->parent->child; - for (;;) { - tmp = *p; - if (!tmp) - break; - if (tmp == old) { - *p = tmp->sibling; - old->parent = NULL; - 
return 0; - } - p = &tmp->sibling; - } - return -EINVAL; -} - -/** - * request_resource - request and reserve an I/O or memory resource - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * - * Returns 0 for success, negative error code on error. - */ -int request_resource(struct resource *root, struct resource *new) -{ - struct resource *conflict; - - write_lock(&resource_lock); - conflict = __request_resource(root, new); - write_unlock(&resource_lock); - return conflict ? -EBUSY : 0; -} - -EXPORT_SYMBOL(request_resource); - -/** - * release_resource - release a previously reserved resource - * @old: resource pointer - */ -int release_resource(struct resource *old) -{ - int retval; - - write_lock(&resource_lock); - retval = __release_resource(old); - write_unlock(&resource_lock); - return retval; -} - -EXPORT_SYMBOL(release_resource); - -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) -/* - * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. - * If found, returns 0, res is overwritten, if not found, returns -1. 
- */ -static int find_next_system_ram(struct resource *res) -{ - resource_size_t start, end; - struct resource *p; - - BUG_ON(!res); - - start = res->start; - end = res->end; - BUG_ON(start >= end); - - read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ - if (p->flags != res->flags) - continue; - if (p->start > end) { - p = NULL; - break; - } - if ((p->end >= start) && (p->start < end)) - break; - } - read_unlock(&resource_lock); - if (!p) - return -1; - /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; - return 0; -} -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) -{ - struct resource res; - unsigned long pfn, len; - u64 orig_end; - int ret = -1; - res.start = (u64) start_pfn << PAGE_SHIFT; - res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { - pfn = (unsigned long)(res.start >> PAGE_SHIFT); - len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); - ret = (*func)(pfn, len, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; -} - -#endif - -/* - * Find empty slot in the resource tree given range and alignment. - */ -static int find_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), - void *alignf_data) -{ - struct resource *this = root->child; - - new->start = root->start; - /* - * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to new->end below would cause an underflow. 
- */ - if (this && this->start == 0) { - new->start = this->end + 1; - this = this->sibling; - } - for(;;) { - if (this) - new->end = this->start - 1; - else - new->end = root->end; - if (new->start < min) - new->start = min; - if (new->end > max) - new->end = max; - new->start = ALIGN(new->start, align); - if (alignf) - alignf(alignf_data, new, size, align); - if (new->start < new->end && new->end - new->start >= size - 1) { - new->end = new->start + size - 1; - return 0; - } - if (!this) - break; - new->start = this->end + 1; - this = this->sibling; - } - return -EBUSY; -} - -/** - * allocate_resource - allocate empty slot in the resource tree given range & alignment - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate - * @align: alignment requested, in bytes - * @alignf: alignment function, optional, called if not NULL - * @alignf_data: arbitrary data to pass to the @alignf function - */ -int allocate_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), - void *alignf_data) -{ - int err; - - write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); - if (err >= 0 && __request_resource(root, new)) - err = -EBUSY; - write_unlock(&resource_lock); - return err; -} - -EXPORT_SYMBOL(allocate_resource); - -/* - * Insert a resource into the resource tree. 
If successful, return NULL, - * otherwise return the conflicting resource (compare to __request_resource()) - */ -static struct resource * __insert_resource(struct resource *parent, struct resource *new) -{ - struct resource *first, *next; - - for (;; parent = first) { - first = __request_resource(parent, new); - if (!first) - return first; - - if (first == parent) - return first; - - if ((first->start > new->start) || (first->end < new->end)) - break; - if ((first->start == new->start) && (first->end == new->end)) - break; - } - - for (next = first; ; next = next->sibling) { - /* Partial overlap? Bad, and unfixable */ - if (next->start < new->start || next->end > new->end) - return next; - if (!next->sibling) - break; - if (next->sibling->start > new->end) - break; - } - - new->parent = parent; - new->sibling = next->sibling; - new->child = first; - - next->sibling = NULL; - for (next = first; next; next = next->sibling) - next->parent = new; - - if (parent->child == first) { - parent->child = new; - } else { - next = parent->child; - while (next->sibling != first) - next = next->sibling; - next->sibling = new; - } - return NULL; -} - -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. - */ -int insert_resource(struct resource *parent, struct resource *new) -{ - struct resource *conflict; - - write_lock(&resource_lock); - conflict = __insert_resource(parent, new); - write_unlock(&resource_lock); - return conflict ? 
-EBUSY : 0; -} - -/** - * insert_resource_expand_to_fit - Insert a resource into the resource tree - * @root: root resource descriptor - * @new: new resource to insert - * - * Insert a resource into the resource tree, possibly expanding it in order - * to make it encompass any conflicting resources. - */ -void insert_resource_expand_to_fit(struct resource *root, struct resource *new) -{ - if (new->parent) - return; - - write_lock(&resource_lock); - for (;;) { - struct resource *conflict; - - conflict = __insert_resource(root, new); - if (!conflict) - break; - if (conflict == root) - break; - - /* Ok, expand resource to cover the conflict, then try again .. */ - if (conflict->start < new->start) - new->start = conflict->start; - if (conflict->end > new->end) - new->end = conflict->end; - - printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); - } - write_unlock(&resource_lock); -} - -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. 
- */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) -{ - struct resource *tmp, *parent = res->parent; - resource_size_t end = start + size - 1; - int result = -EBUSY; - - write_lock(&resource_lock); - - if ((start < parent->start) || (end > parent->end)) - goto out; - - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - - if (res->sibling && (res->sibling->start <= end)) - goto out; - - tmp = parent->child; - if (tmp != res) { - while (tmp->sibling != res) - tmp = tmp->sibling; - if (start <= tmp->end) - goto out; - } - - res->start = start; - res->end = end; - result = 0; - - out: - write_unlock(&resource_lock); - return result; -} - -EXPORT_SYMBOL(adjust_resource); - -/** - * resource_alignment - calculate resource's alignment - * @res: resource pointer - * - * Returns alignment on success, 0 (invalid alignment) on failure. - */ -resource_size_t resource_alignment(struct resource *res) -{ - switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { - case IORESOURCE_SIZEALIGN: - return resource_size(res); - case IORESOURCE_STARTALIGN: - return res->start; - default: - return 0; - } -} - -/* - * This is compatibility stuff for IO resources. - * - * Note how this, unlike the above, knows about - * the IO flag meanings (busy etc). - * - * request_region creates a new busy region. - * - * check_region returns non-zero if the area is already busy. - * - * release_region releases a matching busy region. 
- */ - -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - */ -struct resource * __request_region(struct resource *parent, - resource_size_t start, resource_size_t n, - const char *name) -{ - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - - if (res) { - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - - write_lock(&resource_lock); - - for (;;) { - struct resource *conflict; - - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } - - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; - break; - } - write_unlock(&resource_lock); - } - return res; -} -EXPORT_SYMBOL(__request_region); - -/** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. 
- */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region"); - if (!res) - return -EBUSY; - - release_resource(res); - kfree(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - -/** - * __release_region - release a previously reserved resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * The described resource region must match a currently busy region. - */ -void __release_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource **p; - resource_size_t end; - - p = &parent->child; - end = start + n - 1; - - write_lock(&resource_lock); - - for (;;) { - struct resource *res = *p; - - if (!res) - break; - if (res->start <= start && res->end >= end) { - if (!(res->flags & IORESOURCE_BUSY)) { - p = &res->child; - continue; - } - if (res->start != start || res->end != end) - break; - *p = res->sibling; - write_unlock(&resource_lock); - kfree(res); - return; - } - p = &res->sibling; - } - - write_unlock(&resource_lock); - - printk(KERN_WARNING "Trying to free nonexistent resource " - "<%016llx-%016llx>\n", (unsigned long long)start, - (unsigned long long)end); -} -EXPORT_SYMBOL(__release_region); - -/* - * Managed region resource - */ -struct region_devres { - struct resource *parent; - resource_size_t start; - resource_size_t n; -}; - -static void devm_region_release(struct device *dev, void *res) -{ - struct region_devres *this = res; - - __release_region(this->parent, this->start, this->n); -} - -static int devm_region_match(struct device *dev, void *res, void *match_data) -{ - struct region_devres *this = res, *match = match_data; - - return this->parent == match->parent && - this->start == match->start && this->n == match->n; -} - -struct resource * __devm_request_region(struct device *dev, - struct resource *parent, resource_size_t 
start, - resource_size_t n, const char *name) -{ - struct region_devres *dr = NULL; - struct resource *res; - - dr = devres_alloc(devm_region_release, sizeof(struct region_devres), - GFP_KERNEL); - if (!dr) - return NULL; - - dr->parent = parent; - dr->start = start; - dr->n = n; - - res = __request_region(parent, start, n, name); - if (res) - devres_add(dev, dr); - else - devres_free(dr); - - return res; -} -EXPORT_SYMBOL(__devm_request_region); - -void __devm_release_region(struct device *dev, struct resource *parent, - resource_size_t start, resource_size_t n) -{ - struct region_devres match_data = { parent, start, n }; - - __release_region(parent, start, n); - WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, - &match_data)); -} -EXPORT_SYMBOL(__devm_release_region); - -/* - * Called from init/main.c to reserve IO ports. - */ -#define MAXRESERVE 4 -static int __init reserve_setup(char *str) -{ - static int reserved; - static struct resource reserve[MAXRESERVE]; - - for (;;) { - unsigned int io_start, io_num; - int x = reserved; - - if (get_option (&str, &io_start) != 2) - break; - if (get_option (&str, &io_num) == 0) - break; - if (x < MAXRESERVE) { - struct resource *res = reserve + x; - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; - res->child = NULL; - if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) - reserved = x+1; - } - } - return 1; -} - -__setup("reserve=", reserve_setup); -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner - * - * This code is based on the rt.c implementation in the preempt-rt tree. 
- * Portions of said code are - * - * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Copyright (C) 2006 Esben Nielsen - * Copyright (C) 2006 Kihon Technologies Inc., - * Steven Rostedt - * - * See rt.c in preempt-rt for proper credits and further information - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex_common.h" - -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (spin_is_locked(¤t->pi_lock)) \ - spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk(""); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. 
held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - -void rt_mutex_debug_task_free(struct task_struct *task) -{ - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); -} - -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. - */ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!rt_trace_on || detect || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !rt_trace_on) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - TRACE_OFF_NOLOCK(); - - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! 
]\n"); - printk( "--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. ]\n\n"); - local_irq_disable(); -} - -void debug_rt_mutex_lock(struct rt_mutex *lock) -{ -} - -void debug_rt_mutex_unlock(struct rt_mutex *lock) -{ - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); -} - -void -debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) -{ -} - -void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); -} - -void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); - waiter->deadlock_task_pid = NULL; -} - -void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) -{ - put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); - memset(waiter, 0x22, sizeof(*waiter)); -} - -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) -{ - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, 
sizeof(*lock)); - lock->name = name; -} - -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; - int event; - struct sys_device sysdev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = 
atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - - if (!lockwakeup && td->bkl == 4) { - unlock_kernel(); - td->bkl = 0; - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; - lock_kernel(); - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; - unlock_kernel(); - td->bkl = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. 
- * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKBKL: - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - case RTTEST_LOCKBKL: - return; - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - 
set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, sysdev); - tid = td->sysdev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - 
char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, sysdev); - tsk = threads[td->sysdev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->sysdev.id].owner); - - return curr - buf; -} - -static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); -static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); - -static struct sysdev_class rttest_sysclass = { - .name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].sysdev.cls = &rttest_sysclass; - thread_data[id].sysdev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return sysdev_register(&thread_data[id].sysdev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = sysdev_class_register(&rttest_sysclass); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); - if (ret) - break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); -/* - * RT-Mutexes: simple blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner. 
- * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner - * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt - * Copyright (C) 2006 Esben Nielsen - * - * See Documentation/rt-mutex-design.txt for details. - */ -#include -#include -#include -#include - -#include "rtmutex_common.h" - -/* - * lock->owner state tracking: - * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. - * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. - * - * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. - * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. 
- */ - -static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) -{ - unsigned long val = (unsigned long)owner | mask; - - if (rt_mutex_has_waiters(lock)) - val |= RT_MUTEX_HAS_WAITERS; - - lock->owner = (struct task_struct *)val; -} - -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static void fixup_rt_mutex_waiters(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); -} - -/* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - -/* - * Calculate task priority from the waiter list priority - * - * Return task->normal_prio when the waiter list is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->pi_list_entry.prio, - task->normal_prio); -} - -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. 
- */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -static void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); -} - -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - -/* - * Adjust the priority chain. Also used for deadlock detection. - * Decreases task's usage by one - may thus free the task. - * Returns 0 or -EDEADLK. - */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, - struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) -{ - struct rt_mutex *lock; - struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; - int detect_deadlock, ret = 0, depth = 0; - unsigned long flags; - - detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, - deadlock_detect); - - /* - * The (de)boosting is a step by step approach with a lot of - * pitfalls. We want this to be preemptible and we want hold a - * maximum of two locks per step. So we have to check - * carefully whether things change under us. - */ - again: - if (++depth > max_lock_depth) { - static int prev_max; - - /* - * Print this only once. If the admin changes the limit, - * print a new message when reaching the limit again. 
- */ - if (prev_max != max_lock_depth) { - prev_max = max_lock_depth; - printk(KERN_WARNING "Maximum lock depth %d reached " - "task: %s (%d)\n", max_lock_depth, - top_task->comm, task_pid_nr(top_task)); - } - put_task_struct(task); - - return deadlock_detect ? -EDEADLK : 0; - } - retry: - /* - * Task can not go away as we did a get_task() before ! - */ - spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - /* - * Check whether the end of the boosting chain has been - * reached or the state of the chain has changed while we - * dropped the locks. - */ - if (!waiter || !waiter->task) - goto out_unlock_pi; - - /* - * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: - */ - if (orig_waiter && !orig_waiter->task) - goto out_unlock_pi; - - /* - * Drop out, when the task has no waiters. Note, - * top_waiter can be NULL, when we are in the deboosting - * mode! - */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; - - /* - * When deadlock detection is off then we check, if further - * priority adjustment is necessary. - */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) - goto out_unlock_pi; - - lock = waiter->lock; - if (!spin_trylock(&lock->wait_lock)) { - spin_unlock_irqrestore(&task->pi_lock, flags); - cpu_relax(); - goto retry; - } - - /* Deadlock detection */ - if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? 
-EDEADLK : 0; - goto out_unlock_pi; - } - - top_waiter = rt_mutex_top_waiter(lock); - - /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); - - /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); - put_task_struct(task); - - /* Grab the next task */ - task = rt_mutex_owner(lock); - get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); - - if (waiter == rt_mutex_top_waiter(lock)) { - /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - - } else if (top_waiter == waiter) { - /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); - waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - } - - spin_unlock_irqrestore(&task->pi_lock, flags); - - top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); - - if (!detect_deadlock && waiter != top_waiter) - goto out_put_task; - - goto again; - - out_unlock_pi: - spin_unlock_irqrestore(&task->pi_lock, flags); - out_put_task: - put_task_struct(task); - - return ret; -} - -/* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == current) - return 1; - - spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * 
Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be current: - */ - if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); - plist_add(&next->pi_list_entry, ¤t->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); - } - return 1; -} - -/* - * Try to take an rt-mutex - * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * - * Must be called with lock->wait_lock held. - */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) -{ - /* - * We have to be careful here if the atomic speedups are - * enabled, such that, when - * - no other waiter is on the lock - * - the lock has been released since we did the cmpxchg - * the lock can be released or taken while we are doing the - * checks and marking the lock with RT_MUTEX_HAS_WAITERS. - * - * The atomic acquire/release aware variant of - * mark_rt_mutex_waiters uses a cmpxchg loop. 
After setting - * the WAITERS bit, the atomic release / acquire can not - * happen anymore and lock->wait_lock protects us from the - * non-atomic case. - * - * Note, that this might set lock->owner = - * RT_MUTEX_HAS_WAITERS in the case the lock is not contended - * any more. This is fixed up when we take the ownership. - * This is the transitional state explained at the top of this file. - */ - mark_rt_mutex_waiters(lock); - - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) - return 0; - - /* We got the lock. */ - debug_rt_mutex_lock(lock); - - rt_mutex_set_owner(lock, current, 0); - - rt_mutex_deadlock_account_lock(lock, current); - - return 1; -} - -/* - * Task blocks on lock. - * - * Prepare waiter and propagate pi chain - * - * This must be called with lock->wait_lock held. - */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - int detect_deadlock) -{ - struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; - int chain_walk = 0, res; - - spin_lock_irqsave(¤t->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; - waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); - - current->pi_blocked_on = waiter; - - spin_unlock_irqrestore(¤t->pi_lock, flags); - - if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); - - __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) - chain_walk 
= 1; - - if (!chain_walk) - return 0; - - /* - * The owner can't disappear while holding a lock, - * so the owner struct is protected by wait_lock. - * Gets dropped in rt_mutex_adjust_prio_chain()! - */ - get_task_struct(owner); - - spin_unlock(&lock->wait_lock); - - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); - - spin_lock(&lock->wait_lock); - - return res; -} - -/* - * Wake up the next waiter on the lock. - * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. - * - * Called with lock->wait_lock held. - */ -static void wakeup_next_waiter(struct rt_mutex *lock) -{ - struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; - unsigned long flags; - - spin_lock_irqsave(¤t->pi_lock, flags); - - waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); - - /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. - */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - - spin_unlock_irqrestore(¤t->pi_lock, flags); - - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. 
- */ - spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); -} - -/* - * Remove a waiter from a lock - * - * Must be called with lock->wait_lock held - */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - int first = (waiter == rt_mutex_top_waiter(lock)); - struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; - int chain_walk = 0; - - spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; - current->pi_blocked_on = NULL; - spin_unlock_irqrestore(¤t->pi_lock, flags); - - if (first && owner != current) { - - spin_lock_irqsave(&owner->pi_lock, flags); - - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); - } - __rt_mutex_adjust_prio(owner); - - if (owner->pi_blocked_on) - chain_walk = 1; - - spin_unlock_irqrestore(&owner->pi_lock, flags); - } - - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - - if (!chain_walk) - return; - - /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ - get_task_struct(owner); - - spin_unlock(&lock->wait_lock); - - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - - spin_lock(&lock->wait_lock); -} - -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - unsigned long flags; - - spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { - spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - - spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); -} - -/* - * Slow path lock function: - */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) -{ - struct rt_mutex_waiter waiter; - int ret = 0; - - debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; - - spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? 
*/ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - /* - * waiter.task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter.task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - - spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(&waiter); - - if (waiter.task) - schedule_rt_mutex(lock); - - spin_lock(&lock->wait_lock); - set_current_state(state); - } - - set_current_state(TASK_RUNNING); - - if (unlikely(waiter.task)) - remove_waiter(lock, &waiter); - - /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - spin_unlock(&lock->wait_lock); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - - debug_rt_mutex_free_waiter(&waiter); - - return ret; -} - -/* - * Slow path try-lock function: - */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) -{ - int ret = 0; - - spin_lock(&lock->wait_lock); - - if (likely(rt_mutex_owner(lock) != current)) { - - ret = try_to_take_rt_mutex(lock); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. 
- */ - fixup_rt_mutex_waiters(lock); - } - - spin_unlock(&lock->wait_lock); - - return ret; -} - -/* - * Slow path to release a rt-mutex: - */ -static void __sched -rt_mutex_slowunlock(struct rt_mutex *lock) -{ - spin_lock(&lock->wait_lock); - - debug_rt_mutex_unlock(lock); - - rt_mutex_deadlock_account_unlock(current); - - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - spin_unlock(&lock->wait_lock); - return; - } - - wakeup_next_waiter(lock); - - spin_unlock(&lock->wait_lock); - - /* Undo pi boosting if necessary: */ - rt_mutex_adjust_prio(current); -} - -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) -{ - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, NULL, detect_deadlock); -} - -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) -{ - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, timeout, detect_deadlock); -} - -static inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 1; - } - return slowfn(lock); -} - -static inline void -rt_mutex_fastunlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex 
*lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) - rt_mutex_deadlock_account_unlock(current); - else - slowfn(lock); -} - -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) -{ - might_sleep(); - - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock); - -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); - -/** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEOUT when the timeout expired - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); - -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * Returns 1 on success and 0 on contention - */ -int __sched rt_mutex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); -} 
-EXPORT_SYMBOL_GPL(rt_mutex_trylock); - -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) -{ - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_unlock); - -/*** - * rt_mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void rt_mutex_destroy(struct rt_mutex *lock) -{ - WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif -} - -EXPORT_SYMBOL_GPL(rt_mutex_destroy); - -/** - * __rt_mutex_init - initialize the rt lock - * - * @lock: the rt lock to be initialized - * - * Initialize the rt lock to unlocked state. - * - * Initializing of a locked rt lock is not allowed - */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) -{ - lock->owner = NULL; - spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list, &lock->wait_lock); - - debug_rt_mutex_init(lock, name); -} -EXPORT_SYMBOL_GPL(__rt_mutex_init); - -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * Special API call for PI-futex support - */ -void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - __rt_mutex_init(lock, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); - rt_mutex_deadlock_account_lock(lock, proxy_owner); -} - -/** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. 
Caller has to do serializing itself - * Special API call for PI-futex support - */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); - rt_mutex_deadlock_account_unlock(proxy_owner); -} - -/** - * rt_mutex_next_owner - return the next owner of the lock - * - * @lock: the rt lock query - * - * Returns the next owner of the lock or NULL - * - * Caller has to serialize against other accessors to the lock - * itself. - * - * Special API call for PI-futex support - */ -struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - return NULL; - - return rt_mutex_top_waiter(lock)->task; -} -/* kernel/rwsem.c: R/W semaphores, public implementation - * - * Written by David Howells (dhowells@redhat.com). - * Derived from asm-i386/semaphore.h - */ - -#include -#include -#include -#include -#include - -#include -#include - -/* - * lock for reading - */ -void __sched down_read(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read); - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int down_read_trylock(struct rw_semaphore *sem) -{ - int ret = __down_read_trylock(sem); - - if (ret == 1) - rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_read_trylock); - -/* - * lock for writing - */ -void __sched down_write(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write); - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int down_write_trylock(struct rw_semaphore *sem) -{ - int ret = __down_write_trylock(sem); - - if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - 
return ret; -} - -EXPORT_SYMBOL(down_write_trylock); - -/* - * release a read lock - */ -void up_read(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_read(sem); -} - -EXPORT_SYMBOL(up_read); - -/* - * release a write lock - */ -void up_write(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_write(sem); -} - -EXPORT_SYMBOL(up_write); - -/* - * downgrade write lock to read lock - */ -void downgrade_write(struct rw_semaphore *sem) -{ - /* - * lockdep: a downgraded write will live on as a write - * dependency. - */ - __downgrade_write(sem); -} - -EXPORT_SYMBOL(downgrade_write); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void down_read_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read_nested); - -void down_read_non_owner(struct rw_semaphore *sem) -{ - might_sleep(); - - __down_read(sem); -} - -EXPORT_SYMBOL(down_read_non_owner); - -void down_write_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write_nested); - -void up_read_non_owner(struct rw_semaphore *sem) -{ - __up_read(sem); -} - -EXPORT_SYMBOL(up_read_non_owner); - -#endif - - -/* - * kernel/sched.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. 
Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. - * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "sched_cpupri.h" - -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. 
- */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - -/* - * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define DEF_TIMESLICE (100 * HZ / 1000) - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) - -#ifdef CONFIG_SMP -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - -static inline int rt_policy(int policy) -{ - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) - return 1; - return 0; -} - -static inline int task_has_rt_policy(struct task_struct *p) -{ - return rt_policy(p->policy); -} - -/* - * This is the priority-queue data structure of the RT scheduling class: - */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct rt_bandwidth { - /* nests inside the rq lock: */ - spinlock_t rt_runtime_lock; - ktime_t rt_period; - u64 rt_runtime; - struct hrtimer rt_period_timer; -}; - -static struct rt_bandwidth def_rt_bandwidth; - -static int 
do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; -} - -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - ktime_t now; - - if (rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - spin_lock(&rt_b->rt_runtime_lock); - for (;;) { - if (hrtimer_active(&rt_b->rt_period_timer)) - break; - - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start(&rt_b->rt_period_timer, - rt_b->rt_period_timer.expires, - HRTIMER_MODE_ABS); - } - spin_unlock(&rt_b->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} -#endif - -/* - * sched_domains_mutex serializes calls to arch_init_sched_domains, - * detach_destroy_domains and partition_sched_domains. 
- */ -static DEFINE_MUTEX(sched_domains_mutex); - -#ifdef CONFIG_GROUP_SCHED - -#include - -struct cfs_rq; - -static LIST_HEAD(task_groups); - -/* task group related information */ -struct task_group { -#ifdef CONFIG_CGROUP_SCHED - struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - struct rt_bandwidth rt_bandwidth; -#endif - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; -}; - -#ifdef CONFIG_USER_SCHED - -/* - * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. - */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ -#define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. 
- */ -static DEFINE_SPINLOCK(task_group_lock); - -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ - -/* - * A weight of 0 or 1 can cause arithmetics problems. - * A weight of a cfs_rq is the sum of weights of which entities - * are queued on this cfs_rq, so a weight of a entity should not be - * too large, so as the shares value of a task group. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ -#define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) - -static int init_task_group_load = INIT_TASK_GROUP_LOAD; -#endif - -/* Default task group. - * Every task in system belong to this group at bootup. - */ -struct task_group init_task_group; - -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_USER_SCHED - tg = p->user->tg; -#elif defined(CONFIG_CGROUP_SCHED) - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_GROUP_SCHED */ - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running; - - u64 exec_clock; - u64 min_vruntime; - 
u64 pair_start; - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - - struct list_head tasks; - struct list_head *balance_iterator; - - /* - * 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr, *next; - - unsigned long nr_spread_over; - -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. 
- */ - unsigned long h_load; - - /* - * this cpu's part of tg->shares - */ - unsigned long shares; - - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; -#endif -#endif -}; - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ -#endif -#ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - int overloaded; -#endif - int rt_throttled; - u64 rt_time; - u64 rt_runtime; - /* Nests inside the rq lock: */ - spinlock_t rt_runtime_lock; - -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; - - struct rq *rq; - struct list_head leaf_rt_rq_list; - struct task_group *tg; - struct sched_rt_entity *rt_se; -#endif -}; - -#ifdef CONFIG_SMP - -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - cpumask_t span; - cpumask_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_t rto_mask; - atomic_t rto_count; -#ifdef CONFIG_SMP - struct cpupri cpupri; -#endif -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. 
- */ -struct rq { - /* runqueue lock: */ - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned char idle_at_tick; -#ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; - unsigned char in_nohz_recently; -#endif - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; - struct rt_rq rt; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. 
Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - struct task_struct *curr, *idle; - unsigned long next_balance; - struct mm_struct *prev_mm; - - u64 clock; - - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - /* cpu of this runqueue: */ - int cpu; - int online; - - unsigned long avg_load_per_task; - - struct task_struct *migration_thread; - struct list_head migration_queue; -#endif - -#ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP - int hrtick_csd_pending; - struct call_single_data hrtick_csd; -#endif - struct hrtimer hrtick_timer; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - - /* sys_sched_yield() stats */ - unsigned int yld_exp_empty; - unsigned int yld_act_empty; - unsigned int yld_both_empty; - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; -#endif -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) -{ - rq->curr->sched_class->check_preempt_curr(rq, p, sync); -} - -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. 
- */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) - -static inline void update_rq_clock(struct rq *rq) -{ - rq->clock = sched_clock_cpu(cpu_of(rq)); -} - -/* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/** - * runqueue_is_locked - * - * Returns true if the current cpu runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -int runqueue_is_locked(void) -{ - int cpu = get_cpu(); - struct rq *rq = cpu_rq(cpu); - int ret; - - ret = spin_is_locked(&rq->lock); - put_cpu(); - return ret; -} - -/* - * Debugging: various feature bits - */ - -#define SCHED_FEAT(name, enabled) \ - __SCHED_FEAT_##name , - -enum { -#include "sched_features.h" -}; - -#undef SCHED_FEAT - -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | - -const_debug unsigned int sysctl_sched_features = -#include "sched_features.h" - 0; - -#undef SCHED_FEAT - -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static __read_mostly char *sched_feat_names[] = { -#include "sched_features.h" - NULL -}; - -#undef SCHED_FEAT - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; - int i; - - for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; - } - - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for 
(i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int neg = 0; - int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - if (strncmp(buf, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; sched_feat_names[i]; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) - sysctl_sched_features &= ~(1UL << i); - else - sysctl_sched_features |= (1UL << i); - break; - } - } - - if (!sched_feat_names[i]) - return -EINVAL; - - filp->f_pos += cnt; - - return cnt; -} - -static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); - -#endif - -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -/* - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ -const_debug unsigned int sysctl_sched_nr_migrate = 32; - -/* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; - -/* - * period over which we measure -rt task cpu usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - -static __read_mostly int scheduler_running; - -/* - * part of the period that we allow rt tasks to run in us. 
- * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - -static inline u64 global_rt_period(void) -{ - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; -} - -static inline u64 global_rt_runtime(void) -{ - if (sysctl_sched_rt_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -} - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -static inline int task_current(struct rq *rq, struct task_struct *p) -{ - return rq->curr == p; -} - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline int task_running(struct rq *rq, struct task_struct *p) -{ - return task_current(rq, p); -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. 
- */ - next->oncpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); -#else - spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->oncpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->oncpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - for (;;) { - struct rq *rq = task_rq(p); - spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - spin_unlock(&rq->lock); - } -} - -/* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. - */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - spin_unlock_irqrestore(&rq->lock, *flags); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - spin_unlock(&rq->lock); -} - -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) - __releases(rq->lock) -{ - spin_unlock_irqrestore(&rq->lock, *flags); -} - -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - -#ifdef CONFIG_SCHED_HRTICK -/* - * Use HR-timers to deliver accurate preemption points. 
- * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. - */ - -/* - * Use hrtick when: - * - enabled by features - * - hrtimer is actually high res - */ -static inline int hrtick_enabled(struct rq *rq) -{ - if (!sched_feat(HRTICK)) - return 0; - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); -} - -static void hrtick_clear(struct rq *rq) -{ - if (hrtimer_active(&rq->hrtick_timer)) - hrtimer_cancel(&rq->hrtick_timer); -} - -/* - * High-resolution timer tick. - * Runs from hardirq context with interrupts disabled. - */ -static enum hrtimer_restart hrtick(struct hrtimer *timer) -{ - struct rq *rq = container_of(timer, struct rq, hrtick_timer); - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - spin_lock(&rq->lock); - update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); - spin_unlock(&rq->lock); - - return HRTIMER_NORESTART; -} - -#ifdef CONFIG_SMP -/* - * called from hardirq (IPI) context - */ -static void __hrtick_start(void *arg) -{ - struct rq *rq = arg; - - spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); - rq->hrtick_csd_pending = 0; - spin_unlock(&rq->lock); -} - -/* - * Called to set the hrtick timer state. 
- * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay) -{ - struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - - timer->expires = time; - - if (rq == this_rq()) { - hrtimer_restart(timer); - } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); - rq->hrtick_csd_pending = 1; - } -} - -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} -#else -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -static void hrtick_start(struct rq *rq, u64 delay) -{ - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); -} - -static void init_hrtick(void) -{ -} -#endif /* CONFIG_SMP */ - -static void init_rq_hrtick(struct rq *rq) -{ -#ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; -#endif - - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; -} -#else -static inline void hrtick_clear(struct rq *rq) -{ -} - -static inline void init_rq_hrtick(struct rq *rq) -{ -} - -static inline void init_hrtick(void) -{ -} -#endif - -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. 
- */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) -{ - int cpu; - - assert_spin_locked(&task_rq(p)->lock); - - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; - - set_tsk_thread_flag(p, TIF_NEED_RESCHED); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} - -static void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - if (!spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); -} - -#ifdef CONFIG_NO_HZ -/* - * When add_timer_on() enqueues a timer into the timer wheel of an - * idle CPU then this timer might expire before the next timer event - * which is scheduled to wake up that CPU. In case of a completely - * idle system the next event might even be infinite time into the - * future. wake_up_idle_cpu() ensures that the CPU is woken up and - * leaves the inner idle loop so the newly added timer is taken into - * account when the CPU goes back to idle and evaluates the timer - * wheel for the next timer event. - */ -void wake_up_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (cpu == smp_processor_id()) - return; - - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. 
The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) - smp_send_reschedule(cpu); -} -#endif /* CONFIG_NO_HZ */ - -#else /* !CONFIG_SMP */ -static void resched_task(struct task_struct *p) -{ - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} -#endif /* CONFIG_SMP */ - -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) - lw->inv_weight = 1; - else - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) - / (lw->weight+1); - } - - tmp = (u64)delta_exec * weight; - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. 
For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -#define WEIGHT_IDLEPRIO 2 -#define WMULT_IDLEPRIO (1 << 31) - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
- * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; - -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator); -#endif - -#ifdef CONFIG_CGROUP_CPUACCT -static void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); 
-static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); - -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) -{ - struct task_group *parent, *child; - - rcu_read_lock(); - parent = &root_task_group; -down: - (*down)(parent, cpu, sd); - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; - } - (*up)(parent, cpu, sd); - - child = parent; - parent = parent->parent; - if (parent) - goto up; - rcu_read_unlock(); -} - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void -__update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) -{ - int boost = 0; - unsigned long shares; - unsigned long rq_weight; - - if (!tg->se[cpu]) - return; - - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. 
- */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; - - /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * - */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); - - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; - - __set_se_shares(tg->se[cpu], shares); -} - -/* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. - */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long rq_weight = 0; - unsigned long shares = 0; - int i; - - for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; - shares += tg->cfs_rq[i]->shares; - } - - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, i, shares, rq_weight); - spin_unlock_irqrestore(&rq->lock, flags); - } -} - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. 
- */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long load; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; -} - -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ -} - -static void update_shares(struct sched_domain *sd) -{ - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; - - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); - } -} - -static void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ - spin_unlock(&rq->lock); - update_shares(sd); - spin_lock(&rq->lock); -} - -static void update_h_load(int cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); -} - -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - -static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ -} - -#endif - -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - -#include "sched_stats.h" -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - -#define sched_class_highest (&rt_sched_class) -#define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) - -static void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; -} - -static void dec_nr_running(struct rq *rq) -{ - rq->nr_running--; -} - -static void set_load_weight(struct task_struct *p) -{ - if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; - return; - } - - /* - * 
SCHED_IDLE tasks get minimal weight: - */ - if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; - return; - } - - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; -} - -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) -{ - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); - p->se.on_rq = 1; -} - -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) -{ - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } - - sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, sleep); - p->se.on_rq = 0; -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; -} - -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - int prio; - - if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. 
- */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/* - * activate_task - move a task to the runqueue. - */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, wakeup); - inc_nr_running(rq); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - - dequeue_task(rq, p, sleep); - dec_nr_running(rq); -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. - */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. 
- */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio, int running) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); -} - -#ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) - return 1; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - - -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ - int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; - -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; - if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); - } -#endif - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; - - __set_task_cpu(p, new_cpu); -} - -struct migration_req { - 
struct list_head list; - - struct task_struct *task; - int dest_cpu; - - struct completion done; -}; - -/* - * The task's runqueue lock must be held. - * Returns true if you have to wait for migration thread. - */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) -{ - struct rq *rq = task_rq(p); - - /* - * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. - */ - if (!p->se.on_rq && !task_running(rq, p)) { - set_task_cpu(p, dest_cpu); - return 0; - } - - init_completion(&req->done); - req->task = p; - req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); - - return 1; -} - -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - int running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! 
- */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - on_rq = p->se.on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } - task_rq_unlock(rq, &flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it wa still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. 
(to get signals handled.) - * - * NOTE: this function doesnt have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. 
- */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) - continue; - - local_group = cpu_isset(this_cpu, group->cpumask); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu_mask_nr(i, group->cpumask) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. 
In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - cpumask_t span, tmpmask; - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - span = sd->span; - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - sd = NULL; - weight = cpus_weight(span); - for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - -#endif /* CONFIG_SMP */ - -/*** - * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread - * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? - * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * returns failure only if the task is already active. 
- */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) -{ - int cpu, orig_cpu, this_cpu, success = 0; - unsigned long flags; - long old_state; - struct rq *rq; - - if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; - -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE)) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - update_shares(sd); - break; - } - } - } -#endif - - smp_wmb(); - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - - if (p->se.on_rq) - goto out_running; - - cpu = task_cpu(p); - orig_cpu = cpu; - this_cpu = smp_processor_id(); - -#ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; - - cpu = p->sched_class->select_task_rq(p, sync); - if (cpu != orig_cpu) { - set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } - -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ - -out_activate: -#endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); - if (sync) - schedstat_inc(p, se.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); - else - schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); - activate_task(rq, p, 1); - success = 1; - -out_running: - trace_mark(kernel_sched_wakeup, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); - 
check_preempt_curr(rq, p, sync); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); -#endif -out: - current->se.last_wakeup = current->se.sum_exec_runtime; - - task_rq_unlock(rq, &flags); - - return success; -} - -int wake_up_process(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_ALL, 0); -} -EXPORT_SYMBOL(wake_up_process); - -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. - * - * __sched_fork() is basic setup used by init_idle() too: - */ -static void __sched_fork(struct task_struct *p) -{ - p->se.exec_start = 0; - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->se.last_wakeup = 0; - p->se.avg_overlap = 0; - -#ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sum_sleep_runtime = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; -#endif - - INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. 
- */ - p->state = TASK_RUNNING; -} - -/* - * fork()/clone()-time setup: - */ -void sched_fork(struct task_struct *p, int clone_flags) -{ - int cpu = get_cpu(); - - __sched_fork(p); - -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - - /* - * Make sure we do not leak PI boosting priority to the child: - */ - p->prio = current->normal_prio; - if (!rt_prio(p->prio)) - p->sched_class = &fair_sched_class; - -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (likely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; -#endif -#ifdef CONFIG_PREEMPT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif - put_cpu(); -} - -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. 
- */ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) -{ - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); - update_rq_clock(rq); - - p->prio = effective_prio(p); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } - trace_mark(kernel_sched_wakeup_new, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p, 0); -#ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); -#endif - task_rq_unlock(rq, &flags); -} - -#ifdef CONFIG_PREEMPT_NOTIFIERS - -/** - * preempt_notifier_register - tell me when current is being being preempted & rescheduled - * @notifier: notifier struct to register - */ -void preempt_notifier_register(struct preempt_notifier *notifier) -{ - hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -} -EXPORT_SYMBOL_GPL(preempt_notifier_register); - -/** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister - * - * This is safe to call from within a preemption notifier. 
- */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(¬ifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @prev: the current task that is being switched out - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. 
- * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) - __releases(rq->lock) -{ - struct mm_struct *mm = rq->prev_mm; - long prev_state; - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul - */ - prev_state = prev->state; - finish_arch_switch(prev); - finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (current->sched_class->post_schedule) - current->sched_class->post_schedule(rq); -#endif - - fire_sched_in_preempt_notifiers(current); - if (mm) - mmdrop(mm); - if (unlikely(prev_state == TASK_DEAD)) { - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - put_task_struct(prev); - } -} - -/** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. 
- */ -asmlinkage void schedule_tail(struct task_struct *prev) - __releases(rq->lock) -{ - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); -} - -/* - * context_switch - switch to the new MM and the new - * thread's register state. - */ -static inline void -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - struct mm_struct *mm, *oldmm; - - prepare_task_switch(rq, prev, next); - trace_mark(kernel_sched_schedule, - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - prev->pid, next->pid, prev->state, - rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; - /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. - */ - arch_enter_lazy_cpu_mode(); - - if (unlikely(!mm)) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm(oldmm, mm, next); - - if (unlikely(!prev->mm)) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; - } - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif - - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. 
- */ - finish_task_switch(this_rq(), prev); -} - -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. - */ -unsigned long nr_running(void) -{ - unsigned long i, sum = 0; - - for_each_online_cpu(i) - sum += cpu_rq(i)->nr_running; - - return sum; -} - -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - -unsigned long long nr_context_switches(void) -{ - int i; - unsigned long long sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_switches; - - return sum; -} - -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); - - return sum; -} - -unsigned long nr_active(void) -{ - unsigned long i, running = 0, uninterruptible = 0; - - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } - - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; - - return running + uninterruptible; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). 
- */ -static void update_cpu_load(struct rq *this_rq) -{ - unsigned long this_load = this_rq->load.weight; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; - } -} - -#ifdef CONFIG_SMP - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&rq2->lock); - spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } - update_rq_clock(rq1); - update_rq_clock(rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - spin_unlock(&rq1->lock); - if (rq1 != rq2) - spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
- */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. - */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) -{ - struct migration_req req; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; - - /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { - /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); - task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); - - return; - } -out: - task_rq_unlock(rq, &flags); -} - -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. 
- */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) -{ - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - check_preempt_curr(this_rq, p, 0); -} - -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); - return 0; - } - *all_pinned = 0; - - if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); - return 0; - } - - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. 
- */ - - if (!task_hot(p, rq->clock, sd) || - sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) { - schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); - } -#endif - return 1; - } - - if (task_hot(p, rq->clock, sd)) { - schedstat_inc(p, se.nr_failed_migrations_hot); - return 0; - } - return 1; -} - -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator) -{ - int loops = 0, pulled = 0, pinned = 0; - struct task_struct *p; - long rem_load_move = max_load_move; - - if (max_load_move == 0) - goto out; - - pinned = 1; - - /* - * Start the load-balancing iterator: - */ - p = iterator->start(iterator->arg); -next: - if (!p || loops++ > sysctl_sched_nr_migrate) - goto out; - - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - p = iterator->next(iterator->arg); - goto next; - } - - pull_task(busiest, p, this_rq, this_cpu); - pulled++; - rem_load_move -= p->se.load.weight; - - /* - * We only want to steal up to the prescribed amount of weighted load. - */ - if (rem_load_move > 0) { - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; - p = iterator->next(iterator->arg); - goto next; - } -out: - /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). - */ - schedstat_add(sd, lb_gained[idle], pulled); - - if (all_pinned) - *all_pinned = pinned; - - return max_load_move - rem_load_move; -} - -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. 
- */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - const struct sched_class *class = sched_class_highest; - unsigned long total_load_moved = 0; - int this_best_prio = this_rq->curr->prio; - - do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); - class = class->next; - - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) - break; - - } while (class && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator) -{ - struct task_struct *p = iterator->start(iterator->arg); - int pinned = 0; - - while (p) { - if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). - */ - schedstat_inc(sd, lb_gained[idle]); - - return 1; - } - p = iterator->next(iterator->arg); - } - - return 0; -} - -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - const struct sched_class *class; - - for (class = sched_class_highest; class; class = class->next) - if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) - return 1; - - return 0; -} - -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. 
It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. - */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) -{ - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; - unsigned long max_pull; - unsigned long busiest_load_per_task, busiest_nr_running; - unsigned long this_load_per_task, this_nr_running; - int load_idx, group_imb = 0; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance = 1; - unsigned long leader_nr_running = 0, min_load_per_task = 0; - unsigned long min_nr_running = ULONG_MAX; - struct sched_group *group_min = NULL, *group_leader = NULL; -#endif - - max_load = this_load = total_load = total_pwr = 0; - busiest_load_per_task = busiest_nr_running = 0; - this_load_per_task = this_nr_running = 0; - - if (idle == CPU_NOT_IDLE) - load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) - load_idx = sd->newidle_idx; - else - load_idx = sd->idle_idx; - - do { - unsigned long load, group_capacity, max_cpu_load, min_cpu_load; - int local_group; - int i; - int __group_imb = 0; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_nr_running, sum_weighted_load; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; - - local_group = cpu_isset(this_cpu, group->cpumask); - - if (local_group) - balance_cpu = first_cpu(group->cpumask); - - /* Tally up the load of all CPUs in the group */ - sum_weighted_load = sum_nr_running = avg_load = 0; - sum_avg_load_per_task = avg_load_per_task = 0; - - max_cpu_load = 0; - min_cpu_load = ~0UL; - - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); - - if (*sd_idle && rq->nr_running) 
- *sd_idle = 0; - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - } - - avg_load += load; - sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); - - sum_avg_load_per_task += cpu_avg_load_per_task(i); - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { - *balance = 0; - goto ret; - } - - total_load += avg_load; - total_pwr += group->__cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); - - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? 
- */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); - - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - __group_imb = 1; - - group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - - if (local_group) { - this_load = avg_load; - this = group; - this_nr_running = sum_nr_running; - this_load_per_task = sum_weighted_load; - } else if (avg_load > max_load && - (sum_nr_running > group_capacity || __group_imb)) { - max_load = avg_load; - busiest = group; - busiest_nr_running = sum_nr_running; - busiest_load_per_task = sum_weighted_load; - group_imb = __group_imb; - } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto group_next; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (this_nr_running >= group_capacity || - !this_nr_running)) - power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!power_savings_balance || sum_nr_running >= group_capacity - || !sum_nr_running) - goto group_next; - - /* - * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { - group_min = group; - min_nr_running = sum_nr_running; - min_load_per_task = sum_weighted_load / - sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } - } -group_next: -#endif - group = group->next; - } while (group != sd->groups); - - if (!busiest || this_load >= max_load || busiest_nr_running == 0) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - busiest_load_per_task /= busiest_nr_running; - if (group_imb) - busiest_load_per_task = min(busiest_load_per_task, avg_load); - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. 
- */ - if (max_load <= busiest_load_per_task) - goto out_balanced; - - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) - */ - if (max_load < avg_load) { - *imbalance = 0; - goto small_imbalance; - } - - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->__cpu_power, - (avg_load - this_load) * this->__cpu_power) - / SCHED_LOAD_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < busiest_load_per_task) { - unsigned long tmp, pwr_now, pwr_move; - unsigned int imbn; - -small_imbalance: - pwr_move = pwr_now = 0; - imbn = 2; - if (this_nr_running) { - this_load_per_task /= this_nr_running; - if (busiest_load_per_task > this_load_per_task) - imbn = 1; - } else - this_load_per_task = cpu_avg_load_per_task(this_cpu); - - if (max_load - this_load + 2*busiest_load_per_task >= - busiest_load_per_task * imbn) { - *imbalance = busiest_load_per_task; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
- */ - - pwr_now += busiest->__cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->__cpu_power * - min(this_load_per_task, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(busiest, - busiest_load_per_task * SCHED_LOAD_SCALE); - if (max_load > tmp) - pwr_move += busiest->__cpu_power * - min(busiest_load_per_task, max_load - tmp); - - /* Amount of load we'd add */ - if (max_load * busiest->__cpu_power < - busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(this, - max_load * busiest->__cpu_power); - else - tmp = sg_div_cpu_power(this, - busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += this->__cpu_power * - min(this_load_per_task, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = busiest_load_per_task; - } - - return busiest; - -out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - goto ret; - - if (this == group_leader && group_leader != group_min) { - *imbalance = min_load_per_task; - return group_min; - } -#endif -ret: - *imbalance = 0; - return NULL; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in group. - */ -static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) -{ - struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; - int i; - - for_each_cpu_mask_nr(i, group->cpumask) { - unsigned long wl; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); - wl = weighted_cpuload(i); - - if (rq->nr_running == 1 && wl > imbalance) - continue; - - if (wl > max_load) { - max_load = wl; - busiest = rq; - } - } - - return busiest; -} - -/* - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but - * so long as it is large enough. 
- */ -#define MAX_PINNED_INTERVAL 512 - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - */ -static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) -{ - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; - struct sched_group *group; - unsigned long imbalance; - struct rq *busiest; - unsigned long flags; - - cpus_setall(*cpus); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[idle]); - -redo: - update_shares(sd); - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - cpus, balance); - - if (*balance == 0) - goto out_balanced; - - if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, idle, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[idle], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. ld_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - local_irq_save(flags); - double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &all_pinned); - double_rq_unlock(this_rq, busiest); - local_irq_restore(flags); - - /* - * some other cpu did the load balance for us. 
- */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); - - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) - goto redo; - goto out_balanced; - } - } - - if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; - - if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - - spin_lock_irqsave(&busiest->lock, flags); - - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu - */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock_irqrestore(&busiest->lock, flags); - all_pinned = 1; - goto out_one_pinned; - } - - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - spin_unlock_irqrestore(&busiest->lock, flags); - if (active_balance) - wake_up_process(busiest->migration_thread); - - /* - * We've kicked active balancing, reset the failure - * counter. - */ - sd->nr_balance_failed = sd->cache_nice_tries+1; - } - } else - sd->nr_balance_failed = 0; - - if (likely(!active_balance)) { - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * move_tasks). 
- */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - } - - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - - goto out; - -out_balanced: - schedstat_inc(sd, lb_balanced[idle]); - - sd->nr_balance_failed = 0; - -out_one_pinned: - /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) - sd->balance_interval *= 2; - - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; -out: - if (ld_moved) - update_shares(sd); - return ld_moved; -} - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). - * this_rq is locked. - */ -static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) -{ - struct sched_group *group; - struct rq *busiest = NULL; - unsigned long imbalance; - int ld_moved = 0; - int sd_idle = 0; - int all_pinned = 0; - - cpus_setall(*cpus); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as CPU_NOT_IDLE. 
- */ - if (sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); -redo: - update_shares_locked(this_rq, sd); - group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, cpus, NULL); - if (!group) { - schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - /* this_rq->clock is already updated */ - update_rq_clock(busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, CPU_NEWLY_IDLE, - &all_pinned); - double_unlock_balance(this_rq, busiest); - - if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) - goto redo; - } - } - - if (!ld_moved) { - schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - } else - sd->nr_balance_failed = 0; - - update_shares_locked(this_rq, sd); - return ld_moved; - -out_balanced: - schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - sd->nr_balance_failed = 0; - - return 0; -} - -/* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. 
- */ -static void idle_balance(int this_cpu, struct rq *this_rq) -{ - struct sched_domain *sd; - int pulled_task = -1; - unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; - - for_each_domain(this_cpu, sd) { - unsigned long interval; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (sd->flags & SD_BALANCE_NEWIDLE) - /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); - - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) - break; - } - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ - this_rq->next_balance = next_balance; - } -} - -/* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. - */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) -{ - int target_cpu = busiest_rq->push_cpu; - struct sched_domain *sd; - struct rq *target_rq; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); - - /* Search for an sd spanning us and the target CPU. 
*/ - for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) - break; - } - - if (likely(sd)) { - schedstat_inc(sd, alb_count); - - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - double_unlock_balance(busiest_rq, target_rq); -} - -#ifdef CONFIG_NO_HZ -static struct { - atomic_t load_balancer; - cpumask_t cpu_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, -}; - -/* - * This routine will try to nominate the ilb (idle load balancing) - * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. - * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. - * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() - */ -int select_nohz_load_balancer(int stop_tick) -{ - int cpu = smp_processor_id(); - - if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); - cpu_rq(cpu)->in_nohz_recently = 1; - - /* - * If we are going offline and still the leader, give up! 
- */ - if (!cpu_active(cpu) && - atomic_read(&nohz.load_balancer) == cpu) { - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) - BUG(); - return 0; - } - - /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } - - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) - return 1; - } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) - return 0; - - cpu_clear(cpu, nohz.cpu_mask); - - if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) - BUG(); - } - return 0; -} -#endif - -static DEFINE_SPINLOCK(balancing); - -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. 
- */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) -{ - int balance = 1; - struct rq *rq = cpu_rq(cpu); - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize; - cpumask_t tmp; - - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; - - need_serialize = sd->flags & SD_SERIALIZE; - - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } - - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { - /* - * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. - */ - idle = CPU_NOT_IDLE; - } - sd->last_balance = jiffies; - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } - - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!balance) - break; - } - - /* - * next_balance will be updated only when there is a need. - * When the cpu is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - rq->next_balance = next_balance; -} - -/* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. 
- */ -static void run_rebalance_domains(struct softirq_action *h) -{ - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? - CPU_IDLE : CPU_NOT_IDLE; - - rebalance_domains(this_cpu, idle); - -#ifdef CONFIG_NO_HZ - /* - * If this cpu is the owner for idle load balancing, then do the - * balancing on behalf of the other idle cpus whose ticks are - * stopped. - */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; - struct rq *rq; - int balance_cpu; - - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif -} - -/* - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. - */ -static inline void trigger_load_balance(struct rq *rq, int cpu) -{ -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. 
- * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = first_cpu(nohz.cpu_mask); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) - return; -#endif - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); -} - -#else /* CONFIG_SMP */ - -/* - * on UP we do not need to balance between CPUs: - */ -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - -#endif - -DEFINE_PER_CPU(struct kernel_stat, kstat); - -EXPORT_PER_CPU_SYMBOL(kstat); - -/* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. - */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - u64 ns, delta_exec; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; - if (task_current(rq, p)) { - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns += delta_exec; - } - task_rq_unlock(rq, &flags); - - return ns; -} - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time(struct task_struct *p, cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; - - p->utime = cputime_add(p->utime, cputime); - - /* Add user time to cpustat. 
*/ - tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) -{ - cputime64_t tmp; - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - - tmp = cputime_to_cputime64(cputime); - - p->utime = cputime_add(p->utime, cputime); - p->gtime = cputime_add(p->gtime, cputime); - - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); -} - -/* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); - cputime64_t tmp; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); - return; - } - - p->stime = cputime_add(p->stime, cputime); - - /* Add system time to cpustat. 
*/ - tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->stimescaled = cputime_add(p->stimescaled, cputime); -} - -/* - * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(struct task_struct *p, cputime_t steal) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); - struct rq *rq = this_rq(); - - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); -} - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} -#else -cputime_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use 
CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - clock_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); - - return p->prev_stime; -} -#endif - -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. - */ -void scheduler_tick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; - - sched_clock_tick(); - - spin_lock(&rq->lock); - update_rq_clock(rq); - update_cpu_load(rq); - curr->sched_class->task_tick(rq, curr, 0); - spin_unlock(&rq->lock); - -#ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); -#endif -} - -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - -void __kprobes add_preempt_count(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - preempt_count() += val; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? 
- */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -} -EXPORT_SYMBOL(add_preempt_count); - -void __kprobes sub_preempt_count(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif - - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) -{ - struct pt_regs *regs = get_irq_regs(); - - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); - - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); - - if (regs) - show_regs(regs); - else - dump_stack(); -} - -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) -{ - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. 
- */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) - __schedule_bug(prev); - - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - - schedstat_inc(this_rq(), sched_count); -#ifdef CONFIG_SCHEDSTATS - if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); - schedstat_inc(prev, sched_info.bkl_count); - } -#endif -} - -/* - * Pick up the highest-prio task: - */ -static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) -{ - const struct sched_class *class; - struct task_struct *p; - - /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: - */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) - return p; - } - - class = sched_class_highest; - for ( ; ; ) { - p = class->pick_next_task(rq); - if (p) - return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; - } -} - -/* - * schedule() is the main scheduler function. 
- */ -asmlinkage void __sched schedule(void) -{ - struct task_struct *prev, *next; - unsigned long *switch_count; - struct rq *rq; - int cpu; - -need_resched: - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_qsctr_inc(cpu); - prev = rq->curr; - switch_count = &prev->nivcsw; - - release_kernel_lock(prev); -need_resched_nonpreemptible: - - schedule_debug(prev); - - if (sched_feat(HRTICK)) - hrtick_clear(rq); - - /* - * Do the rq-clock update outside the rq lock: - */ - local_irq_disable(); - update_rq_clock(rq); - spin_lock(&rq->lock); - clear_tsk_need_resched(prev); - - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) - prev->state = TASK_RUNNING; - else - deactivate_task(rq, prev, 1); - switch_count = &prev->nvcsw; - } - -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); - - prev->sched_class->put_prev_task(rq, prev); - next = pick_next_task(rq, prev); - - if (likely(prev != next)) { - sched_info_switch(prev, next); - - rq->nr_switches++; - rq->curr = next; - ++*switch_count; - - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - } else - spin_unlock_irq(&rq->lock); - - if (unlikely(reacquire_kernel_lock(current) < 0)) - goto need_resched_nonpreemptible; - - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; -} -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. 
- */ -asmlinkage void __sched preempt_schedule(void) -{ - struct thread_info *ti = current_thread_info(); - - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (likely(ti->preempt_count || irqs_disabled())) - return; - - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); -} -EXPORT_SYMBOL(preempt_schedule); - -/* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. - */ -asmlinkage void __sched preempt_schedule_irq(void) -{ - struct thread_info *ti = current_thread_info(); - - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); - - do { - add_preempt_count(PREEMPT_ACTIVE); - local_irq_enable(); - schedule(); - local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); -} - -#endif /* CONFIG_PREEMPT */ - -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, - void *key) -{ - return try_to_wake_up(curr->private, mode, sync); -} -EXPORT_SYMBOL(default_wake_function); - -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. 
try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, sync, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} - -/** - * __wake_up_sync - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. 
- */ -void -__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - unsigned long flags; - int sync = 1; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - sync = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, NULL); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, 
TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -unsigned long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Returns: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - int ret = 1; - - spin_lock_irq(&x->wait.lock); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irq(&x->wait.lock); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Returns: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. 
- * - */ -bool completion_done(struct completion *x) -{ - int ret = 1; - - spin_lock_irq(&x->wait.lock); - if (!x->done) - ret = 0; - spin_unlock_irq(&x->wait.lock); - return ret; -} -EXPORT_SYMBOL(completion_done); - -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - -#ifdef CONFIG_RT_MUTEXES - -/* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance logic. 
- */ -void rt_mutex_setprio(struct task_struct *p, int prio) -{ - unsigned long flags; - int oldprio, on_rq, running; - struct rq *rq; - const struct sched_class *prev_class = p->sched_class; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_rq_lock(p, &flags); - update_rq_clock(rq); - - oldprio = p->prio; - on_rq = p->se.on_rq; - running = task_current(rq, p); - if (on_rq) - dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - if (rt_prio(prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - - p->prio = prio; - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) { - enqueue_task(rq, p, 0); - - check_class_changed(rq, p, prev_class, oldprio, running); - } - task_rq_unlock(rq, &flags); -} - -#endif - -void set_user_nice(struct task_struct *p, long nice) -{ - int old_prio, delta, on_rq; - unsigned long flags; - struct rq *rq; - - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) - return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rq = task_rq_lock(p, &flags); - update_rq_clock(rq); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: - */ - if (task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } - on_rq = p->se.on_rq; - if (on_rq) - dequeue_task(rq, p, 0); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); - old_prio = p->prio; - p->prio = effective_prio(p); - delta = p->prio - old_prio; - - if (on_rq) { - enqueue_task(rq, p, 0); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); - } -out_unlock: - task_rq_unlock(rq, &flags); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; - - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. 
- */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - - nice = PRIO_TO_NICE(current->static_prio) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; - - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * This is the priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. - */ -int task_prio(const struct task_struct *p) -{ - return p->prio - MAX_RT_PRIO; -} - -/** - * task_nice - return the nice value of a given task. - * @p: the task in question. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - */ -int idle_cpu(int cpu) -{ - return cpu_curr(cpu) == cpu_rq(cpu)->idle; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* Actually do priority change: must hold rq lock. 
*/ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) -{ - BUG_ON(p->se.on_rq); - - p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); - set_load_weight(p); -} - -static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) -{ - int retval, oldprio, oldpolicy = -1, on_rq, running; - unsigned long flags; - const struct sched_class *prev_class = p->sched_class; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); -recheck: - /* double check policy once rq lock held */ - if (policy < 0) - policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. 
- */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) - return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; - unlock_task_sighand(p, &flags); - - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; - } - /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: - */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; - - /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) - return -EPERM; - } - - if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; - } - - /* - * make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - */ - spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the apropriate - * runqueue lock must be held. 
- */ - rq = __task_rq_lock(p); - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); - goto recheck; - } - update_rq_clock(rq); - on_rq = p->se.on_rq; - running = task_current(rq, p); - if (on_rq) - deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - oldprio = p->prio; - __setscheduler(rq, p, policy, param->sched_priority); - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) { - activate_task(rq, p, 0); - - check_class_changed(rq, p, prev_class, oldprio, running); - } - __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); - - rt_mutex_adjust_pi(p); - - return 0; -} - -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, true); -} -EXPORT_SYMBOL_GPL(sched_setscheduler); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. 
- */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, false); -} - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, - struct sched_param __user *, param) -{ - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, -1, param); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -{ - struct task_struct *p; - int retval; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy; - } - read_unlock(&tasklist_lock); - return retval; -} - -/** - * sys_sched_getscheduler - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. 
- */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) -{ - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; - struct task_struct *p; - int retval; - - get_online_cpus(); - read_lock(&tasklist_lock); - - p = find_process_by_pid(pid); - if (!p) { - read_unlock(&tasklist_lock); - put_online_cpus(); - return -ESRCH; - } - - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ - get_task_struct(p); - read_unlock(&tasklist_lock); - - retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - goto out_unlock; - - retval = security_task_setscheduler(p, 0, NULL); - if (retval) - goto out_unlock; - - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); - again: - retval = set_cpus_allowed_ptr(p, &new_mask); - - if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. 
Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - new_mask = cpus_allowed; - goto again; - } - } -out_unlock: - put_task_struct(p); - put_online_cpus(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) -{ - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_t new_mask; - int retval; - - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; - - return sched_setaffinity(pid, &new_mask); -} - -long sched_getaffinity(pid_t pid, cpumask_t *mask) -{ - struct task_struct *p; - int retval; - - get_online_cpus(); - read_lock(&tasklist_lock); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - cpus_and(*mask, p->cpus_allowed, cpu_online_map); - -out_unlock: - read_unlock(&tasklist_lock); - put_online_cpus(); - - return retval; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_t mask; - - if (len < sizeof(cpumask_t)) - return -EINVAL; - - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; - - if 
(copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; - - return sizeof(cpumask_t); -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - */ -SYSCALL_DEFINE0(sched_yield) -{ - struct rq *rq = this_rq_lock(); - - schedstat_inc(rq, yld_count); - current->sched_class->yield_task(rq); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); - - schedule(); - - return 0; -} - -static void __cond_resched(void) -{ -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - __might_sleep(__FILE__, __LINE__); -#endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); -} - -int __sched _cond_resched(void) -{ - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { - __cond_resched(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(_cond_resched); - -/* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). 
- */ -int cond_resched_lock(spinlock_t *lock) -{ - int resched = need_resched() && system_state == SYSTEM_RUNNING; - int ret = 0; - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched && need_resched()) - __cond_resched(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; -} -EXPORT_SYMBOL(cond_resched_lock); - -int __sched cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (need_resched() && system_state == SYSTEM_RUNNING) { - local_bh_enable(); - __cond_resched(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(cond_resched_softirq); - -/** - * yield - yield the current processor to other threads. - * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) - */ -void __sched io_schedule(void) -{ - struct rq *rq = &__raw_get_cpu_var(runqueues); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - schedule(); - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - -long __sched io_schedule_timeout(long timeout) -{ - struct rq *rq = &__raw_get_cpu_var(runqueues); - long ret; - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - ret = schedule_timeout(timeout); - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); - return ret; -} - -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. 
- */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. 
- */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) -{ - struct task_struct *p; - unsigned int time_slice; - int retval; - struct timespec t; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - read_lock(&tasklist_lock); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - /* - * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER - * tasks that are on an otherwise idle runqueue: - */ - time_slice = 0; - if (p->policy == SCHED_RR) { - time_slice = DEF_TIMESLICE; - } else if (p->policy != SCHED_FIFO) { - struct sched_entity *se = &p->se; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(p, &flags); - if (rq->cfs.load.weight) - time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); - } - read_unlock(&tasklist_lock); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; - -out_unlock: - read_unlock(&tasklist_lock); - return retval; -} - -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - -void sched_show_task(struct task_struct *p) -{ - unsigned long free = 0; - unsigned state; - - state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } -#endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); - - show_stack(p, NULL); -} - -void show_state_filter(unsigned long state_filter) -{ - struct task_struct *g, *p; - -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - read_lock(&tasklist_lock); - do_each_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: - */ - touch_nmi_watchdog(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); - } while_each_thread(g, p); - - touch_all_softlockup_watchdogs(); - -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); -#endif - read_unlock(&tasklist_lock); - /* - * Only show locks if all tasks are dumped: - */ - if (state_filter == -1) - debug_show_all_locks(); -} - -void __cpuinit init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. 
- */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - __sched_fork(idle); - idle->se.exec_start = sched_clock(); - - idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); - __set_task_cpu(idle, cpu); - - spin_lock_irqsave(&rq->lock, flags); - rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; -#endif - spin_unlock_irqrestore(&rq->lock, flags); - - /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else - task_thread_info(idle)->preempt_count = 0; -#endif - /* - * The idle tasks have their own, simple scheduling class: - */ - idle->sched_class = &idle_sched_class; -} - -/* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. - */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; - -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. 
- * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_wakeup_granularity *= factor; - - sysctl_sched_shares_ratelimit *= factor; -} - -#ifdef CONFIG_SMP -/* - * This is how migration works: - * - * 1) we queue a struct migration_req structure in the source CPU's - * runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. - * 3) migration thread wakes up (implicitly it forces the migrated - * thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - * task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. - */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. 
- */ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) -{ - struct migration_req req; - unsigned long flags; - struct rq *rq; - int ret = 0; - - rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { - ret = -EINVAL; - goto out; - } - - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { - ret = -EINVAL; - goto out; - } - - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); - } - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) - goto out; - - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); - wait_for_completion(&req.done); - tlb_migrate_finish(p->mm); - return 0; - } -out: - task_rq_unlock(rq, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); - - double_rq_lock(rq_src, rq_dest); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - /* Affinity changed (again). 
*/ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) - goto fail; - - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { - activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } -done: - ret = 1; -fail: - double_rq_unlock(rq_src, rq_dest); - return ret; -} - -/* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. - */ -static int migration_thread(void *data) -{ - int cpu = (long)data; - struct rq *rq; - - rq = cpu_rq(cpu); - BUG_ON(rq->migration_thread != current); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - struct migration_req *req; - struct list_head *head; - - spin_lock_irq(&rq->lock); - - if (cpu_is_offline(cpu)) { - spin_unlock_irq(&rq->lock); - goto wait_to_die; - } - - if (rq->active_balance) { - active_load_balance(rq, cpu); - rq->active_balance = 0; - } - - head = &rq->migration_queue; - - if (list_empty(head)) { - spin_unlock_irq(&rq->lock); - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - continue; - } - req = list_entry(head->next, struct migration_req, list); - list_del_init(head->next); - - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); - - complete(&req->done); - } - __set_current_state(TASK_RUNNING); - return 0; - -wait_to_die: - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) -{ - int ret; - - local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); - local_irq_enable(); - return ret; -} - -/* - * Figure out where task on dead CPU should go, use force if necessary. 
- * NOTE: interrupts should be disabled by the caller - */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) -{ - unsigned long flags; - cpumask_t mask; - struct rq *rq; - int dest_cpu; - - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); -} - -/* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. 
So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); - unsigned long flags; - - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); -} - -/* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. - */ -void sched_idle_next(void) -{ - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - update_rq_clock(rq); - activate_task(rq, p, 0); - - spin_unlock_irqrestore(&rq->lock, flags); -} - -/* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. 
- */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) -{ - struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); - - /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. - */ - spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; - - for ( ; ; ) { - if (!rq->nr_running) - break; - update_rq_clock(rq); - next = pick_next_task(rq, rq->curr); - if (!next) - break; - next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); - - } -} -#endif /* CONFIG_HOTPLUG_CPU */ - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {0, }, -}; - -static struct ctl_table sd_ctl_root[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {0, }, -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and 
could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(12); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is 
terminator */ - - return table; -} - -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_online_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_online_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif - -static void set_rq_online(struct rq *rq) -{ - if (!rq->online) { - const struct sched_class *class; - - cpu_set(rq->cpu, rq->rd->online); - rq->online = 1; - - for_each_class(class) { - if (class->rq_online) - class->rq_online(rq); - } - } -} - -static void set_rq_offline(struct rq *rq) -{ - if (rq->online) { - const struct sched_class *class; - - for_each_class(class) { - if (class->rq_offline) - class->rq_offline(rq); - } - - 
cpu_clear(rq->cpu, rq->rd->online); - rq->online = 0; - } -} - -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int __cpuinit -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - struct task_struct *p; - int cpu = (long)hcpu; - unsigned long flags; - struct rq *rq; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - kthread_bind(p, cpu); - /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); - cpu_rq(cpu)->migration_thread = p; - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - /* Strictly unnecessary, as first user will wake it. */ - wake_up_process(cpu_rq(cpu)->migration_thread); - - /* Update our root-domain */ - rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); - - set_rq_online(rq); - } - spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!cpu_rq(cpu)->migration_thread) - break; - /* Unbind it from offline cpu so it can run. Fall thru. 
*/ - kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); - kthread_stop(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; - break; - - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ - migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - rq->migration_thread = NULL; - /* Idle task back to normal (off runqueue, low prio) */ - spin_lock_irq(&rq->lock); - update_rq_clock(rq); - deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - spin_unlock_irq(&rq->lock); - cpuset_unlock(); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - - /* - * No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. - */ - spin_lock_irq(&rq->lock); - while (!list_empty(&rq->migration_queue)) { - struct migration_req *req; - - req = list_entry(rq->migration_queue.next, - struct migration_req, list); - list_del_init(&req->list); - spin_unlock_irq(&rq->lock); - complete(&req->done); - spin_lock_irq(&rq->lock); - } - spin_unlock_irq(&rq->lock); - break; - - case CPU_DYING: - case CPU_DYING_FROZEN: - /* Update our root-domain */ - rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); - set_rq_offline(rq); - } - spin_unlock_irqrestore(&rq->lock, flags); - break; -#endif - } - return NOTIFY_OK; -} - -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. 
- */ -static struct notifier_block __cpuinitdata migration_notifier = { - .notifier_call = migration_call, - .priority = 10 -}; - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Start one for the boot CPU: */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - return err; -} -early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP - -#ifdef CONFIG_SCHED_DEBUG - -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) -{ - struct sched_group *group = sd->groups; - char str[256]; - - cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(*groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); - - if (!cpu_isset(cpu, sd->span)) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - if (!cpu_isset(cpu, group->cpumask)) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - } - - printk(KERN_DEBUG "%*s groups:", level + 1, ""); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } - - if (!group->__cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR 
"ERROR: domain->cpu_power not " - "set\n"); - break; - } - - if (!cpus_weight(group->cpumask)) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - break; - } - - if (cpus_intersects(*groupmask, group->cpumask)) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } - - cpus_or(*groupmask, *groupmask, group->cpumask); - - cpulist_scnprintf(str, sizeof(str), group->cpumask); - printk(KERN_CONT " %s", str); - - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); - - if (!cpus_equal(sd->span, *groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - cpumask_t *groupmask; - int level = 0; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } - kfree(groupmask); -} -#else /* !CONFIG_SCHED_DEBUG */ -# define sched_domain_debug(sd, cpu) do { } while (0) -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpus_weight(sd->span) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) - return 0; - 
- return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpus_equal(sd->span, parent->span)) - return 0; - - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); - } - if (~cflags & pflags) - return 0; - - return 1; -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - - if (rq->rd) { - struct root_domain *old_rd = rq->rd; - - if (cpu_isset(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpu_clear(rq->cpu, old_rd->span); - - if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) - set_rq_online(rq); - - spin_unlock_irqrestore(&rq->lock, flags); -} - -static void init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - cpus_clear(rd->span); - cpus_clear(rd->online); - - cpupri_init(&rd->cpupri); -} - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - init_rootdomain(rd); - - return rd; -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. 
- */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - sd = sd->parent; - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - rcu_assign_pointer(rq->sd, sd); -} - -/* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); - return 1; -} - -__setup("isolcpus=", isolated_cpu_setup); - -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. 
- */ -static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpus_clear(*covered); - - for_each_cpu_mask_nr(i, *span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpu_isset(i, *covered)) - continue; - - cpus_clear(sg->cpumask); - sg->__cpu_power = 0; - - for_each_cpu_mask_nr(j, *span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpu_set(j, *covered); - cpu_set(j, sg->cpumask); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - -#define SD_NODES_PER_DOMAIN 16 - -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = 0; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. 
It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static void sched_domain_node_span(int node, cpumask_t *span) -{ - nodemask_t used_nodes; - node_to_cpumask_ptr(nodemask, node); - int i; - - cpus_clear(*span); - nodes_clear(used_nodes); - - cpus_or(*span, *span, *nodemask); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - - node_to_cpumask_ptr_next(nodemask, next_node); - cpus_or(*span, *span, *nodemask); - } -} -#endif /* CONFIG_NUMA */ - -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - -/* - * SMT sched-domains: - */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); - -static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) -{ - if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ - -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ - -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) -{ - int group; - - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); - if (sg) - *sg = &per_cpu(sched_group_core, group); - return group; -} -#elif defined(CONFIG_SCHED_MC) -static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) -{ - if (sg) - *sg = &per_cpu(sched_group_core, cpu); - return cpu; -} -#endif - -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); - -static int 
-cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) -{ - int group; -#ifdef CONFIG_SCHED_MC - *mask = cpu_coregroup_map(cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); -#elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_phys, group); - return group; -} - -#ifdef CONFIG_NUMA -/* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. - */ -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); - -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) -{ - int group; - - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); - group = first_cpu(*nodemask); - - if (sg) - *sg = &per_cpu(sched_group_allnodes, group); - return group; -} - -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu_mask_nr(j, sg->cpumask) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. 
- */ - continue; - } - - sg_inc_cpu_power(sg, sd->groups->__cpu_power); - } - sg = sg->next; - } while (sg != group_head); -} -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) -{ - int cpu, i; - - for_each_cpu_mask_nr(cpu, *cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (!sched_group_nodes) - continue; - - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -} -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) -{ -} -#endif /* CONFIG_NUMA */ - -/* - * Initialize sched groups cpu_power. - * - * cpu_power indicates the capacity of sched group, which is used while - * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. 
- */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) -{ - struct sched_domain *child; - struct sched_group *group; - - WARN_ON(!sd || !sd->groups); - - if (cpu != first_cpu(sd->groups->cpumask)) - return; - - child = sd->child; - - sd->groups->__cpu_power = 0; - - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. - */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); - return; - } - - /* - * add cpu_power of each child group to this groups cpu_power - */ - group = child->groups; - do { - sg_inc_cpu_power(sd->groups, group->__cpu_power); - group = group->next; - } while (group != child->groups); -} - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -#define SD_INIT(sd, type) sd_init_##type(sd) -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif - -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. 
- */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -#else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - -static int default_relax_domain_level = -1; - -static int __init setup_relax_domain_level(char *str) -{ - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) - default_relax_domain_level = val; - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); - } -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int __build_sched_domains(const cpumask_t *cpu_map, - struct sched_domain_attr *attr) -{ - int i; - struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; -#ifdef CONFIG_NUMA - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - /* - * Allocate the per-node list of sched groups - 
*/ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; - } -#endif - - rd = alloc_rootdomain(); - if (!rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } - -#if SCHED_CPUMASK_ALLOC - /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } -#endif - tmpmask = (cpumask_t *)allmasks; - - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; -#endif - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); - - *nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(*nodemask, *nodemask, *cpu_map); - -#ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { - sd = &per_cpu(allnodes_domains, i); - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); - p = sd; - sd_allnodes = 1; - } else - p = NULL; - - sd = &per_cpu(node_domains, i); - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->parent = p; - if (p) - p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); -#endif - - p = sd; - sd = &per_cpu(phys_domains, i); - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - sd->span = *nodemask; - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); - -#ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i); - SD_INIT(sd, MC); - 
set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); -#endif - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i); - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); -#endif - } - -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) - continue; - - init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group, - send_covered, tmpmask); - } -#endif - -#ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_core_map = cpu_coregroup_map(i); - cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) - continue; - - init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group, - send_covered, tmpmask); - } -#endif - - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) - continue; - - init_sched_build_groups(nodemask, cpu_map, - &cpu_to_phys_group, - send_covered, tmpmask); - } - -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - - init_sched_build_groups(cpu_map, cpu_map, - 
&cpu_to_allnodes_group, - send_covered, tmpmask); - } - - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); - int j; - - *nodemask = node_to_cpumask(i); - cpus_clear(*covered); - - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { - sched_group_nodes[i] = NULL; - continue; - } - - sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); - goto error; - } - sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { - struct sched_domain *sd; - - sd = &per_cpu(node_domains, j); - sd->groups = sg; - } - sg->__cpu_power = 0; - sg->cpumask = *nodemask; - sg->next = sg; - cpus_or(*covered, *covered, *nodemask); - prev = sg; - - for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % nr_node_ids; - node_to_cpumask_ptr(pnodemask, n); - - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) - break; - - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) - continue; - - sg = kmalloc_node(sizeof(struct sched_group), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - sg->cpumask = *tmpmask; - sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); - prev->next = sg; - prev = sg; - } - } -#endif - - /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); - - init_sched_groups_power(i, sd); - } -#endif -#ifdef 
CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); - - init_sched_groups_power(i, sd); - } -#endif - - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); - - init_sched_groups_power(i, sd); - } - -#ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); - - if (sd_allnodes) { - struct sched_group *sg; - - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, - tmpmask); - init_numa_sched_groups_power(sg); - } -#endif - - /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); -#else - sd = &per_cpu(phys_domains, i); -#endif - cpu_attach_domain(sd, rd, i); - } - - SCHED_CPUMASK_FREE((void *)allmasks); - return 0; - -#ifdef CONFIG_NUMA -error: - free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); - return -ENOMEM; -#endif -} - -static int build_sched_domains(const cpumask_t *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); -} - -static cpumask_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. - */ -static cpumask_t fallback_doms; - -void __attribute__((weak)) arch_update_cpu_topology(void) -{ -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. 
- */ -static int arch_init_sched_domains(const cpumask_t *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!doms_cur) - doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); - dattr_cur = NULL; - err = build_sched_domains(doms_cur); - register_sched_domain_sysctl(); - - return err; -} - -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) -{ - free_sched_groups(cpu_map, tmpmask); -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const cpumask_t *cpu_map) -{ - cpumask_t tmpmask; - int i; - - unregister_sched_domain_sysctl(); - - for_each_cpu_mask_nr(i, *cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. 
- * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_map. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - n = doms_new ? ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); -match1: - ; - } - - if (doms_new == NULL) { - ndoms_cur = 0; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, - dattr_new ? 
dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) - kfree(doms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) -{ - get_online_cpus(); - - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); - - return 0; -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - int ret; - - if (buf[0] != '0' && buf[0] != '1') - return -EINVAL; - - if (smt) - sched_smt_power_savings = (buf[0] == '1'); - else - sched_mc_power_savings = (buf[0] == '1'); - - ret = arch_reinit_sched_domains(); - - return ret ? ret : count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, - char *page) -{ - return sprintf(page, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, - char *page) -{ - return sprintf(page, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef 
CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - -#ifndef CONFIG_CPUSETS -/* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. - */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - partition_sched_domains(1, NULL, NULL); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} -#endif - -static int update_runtime(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -void __init sched_init_smp(void) -{ - cpumask_t non_isolated_cpus; - -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_domains_mutex); - put_online_cpus(); - -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif - - /* RT runtime code needs to handle some 
hotplug events */ - hotcpu_notifier(update_runtime, 0); - - init_hrtick(); - - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) - BUG(); - sched_init_granularity(); -} -#else -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - -int in_sched_functions(unsigned long addr) -{ - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; -#endif - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -} - -static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - rt_rq->highest_prio = MAX_RT_PRIO; -#endif -#ifdef CONFIG_SMP - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - - tg->se[cpu] = se; - /* se could be NULL for init_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - 
se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; - se->parent = parent; -} -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); - rt_rq->tg = tg; - rt_rq->rt_se = rt_se; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); - - tg->rt_se[cpu] = rt_se; - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} -#endif - -void __init sched_init(void) -{ - int i, j; - unsigned long alloc_size = 0, ptr; - -#ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_USER_SCHED - alloc_size *= 2; -#endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). 
- */ - if (alloc_size) { - ptr = (unsigned long)alloc_bootmem(alloc_size); - -#ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - init_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#ifdef CONFIG_USER_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - init_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#ifdef CONFIG_USER_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - } - -#ifdef CONFIG_SMP - init_defrootdomain(); -#endif - - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - -#ifdef CONFIG_RT_GROUP_SCHED - init_rt_bandwidth(&init_task_group.rt_bandwidth, - global_rt_period(), global_rt_runtime()); -#ifdef CONFIG_USER_SCHED - init_rt_bandwidth(&root_task_group.rt_bandwidth, - global_rt_period(), RUNTIME_INF); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_GROUP_SCHED - list_add(&init_task_group.list, &task_groups); - INIT_LIST_HEAD(&init_task_group.children); - -#ifdef CONFIG_USER_SCHED - INIT_LIST_HEAD(&root_task_group.children); - init_task_group.parent = &root_task_group; - list_add(&init_task_group.siblings, &root_task_group.children); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_GROUP_SCHED */ - - for_each_possible_cpu(i) { - struct rq *rq; - - rq = cpu_rq(i); - spin_lock_init(&rq->lock); - 
rq->nr_running = 0; - init_cfs_rq(&rq->cfs, rq); - init_rt_rq(&rq->rt, rq); -#ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.shares = init_task_group_load; - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED - /* - * How much cpu bandwidth does init_task_group get? - * - * In case of task-groups formed thr' the cgroup filesystem, it - * gets 100% of the cpu resources in the system. This overall - * system cpu resource is divided among the tasks of - * init_task_group and its child task-groups in a fair manner, - * based on each entity's (task or task-group's) weight - * (se->load.weight). - * - * In other words, if init_task_group has 10 tasks of weight - * 1024) and two child groups A0 and A1 (of weight 1024 each), - * then A0's share of the cpu resource is: - * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% - * - * We achieve this by letting init_task_group's tasks sit - * directly in rq->cfs (i.e init_task_group->se[] = NULL). - */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - root_task_group.shares = NICE_0_LOAD; - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); - /* - * In case of task-groups formed thr' the user id of tasks, - * init_task_group represents tasks belonging to root user. - * Hence it forms a sibling of all subsequent groups formed. - * In this case, init_task_group gets only a fraction of overall - * system cpu resource, based on the weight assigned to root - * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished - * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of - * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 
- */ - init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, - root_task_group.se[i]); - -#endif -#endif /* CONFIG_FAIR_GROUP_SCHED */ - - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; -#ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); - init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1, - root_task_group.rt_se[i]); -#endif -#endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; -#ifdef CONFIG_SMP - rq->sd = NULL; - rq->rd = NULL; - rq->active_balance = 0; - rq->next_balance = jiffies; - rq->push_cpu = 0; - rq->cpu = i; - rq->online = 0; - rq->migration_thread = NULL; - INIT_LIST_HEAD(&rq->migration_queue); - rq_attach_root(rq, &def_root_domain); -#endif - init_rq_hrtick(rq); - atomic_set(&rq->nr_iowait, 0); - } - - set_load_weight(&init_task); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#endif - -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); -#endif - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current); - - /* - * Make us the idle thread. Technically, schedule() should not be - * called from this thread, however somewhere below it might be, - * but because we are the idle thread, we just pick up running again - * when this runqueue becomes "idle". 
- */ - init_idle(current, smp_processor_id()); - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - - scheduler_running = 1; -} - -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) -{ -#ifdef in_atomic - static unsigned long prev_jiffy; /* ratelimiting */ - - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } -#endif -} -EXPORT_SYMBOL(__might_sleep); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) -{ - int on_rq; - - update_rq_clock(rq); - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(rq, p, 0); - resched_task(rq->curr); - } -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; - - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { - /* - * Only normalize user tasks: - */ - if (!p->mm) - continue; - - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; -#endif - - if (!rt_task(p)) { - /* - * Renice negative nice level userspace - * tasks back to 0: - */ - if (TASK_NICE(p) < 0 && p->mm) - set_user_nice(p, 0); - continue; - } - - spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - - normalize_task(rq, p); - - __task_rq_unlock(rq); - spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); -} - -#endif /* CONFIG_MAGIC_SYSRQ */ - 
-#ifdef CONFIG_IA64 -/* - * These functions are only useful for the IA64 MCA handling. - * - * They can only be called when the whole system has been - * stopped - every CPU needs to be quiescent, and no scheduling - * activity can take place. Using them for anything else would - * be a serious bug, and as a result, they aren't even visible - * under any other configuration. - */ - -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -struct task_struct *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
- */ -void set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void free_fair_sched_group(struct task_group *tg) -{ - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -static -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; - struct rq *rq; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - for_each_possible_cpu(i) { - rq = cpu_rq(i); - - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); - if (!se) - goto err; - - parent_se = parent ? 
parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); - } - - return 1; - - err: - return 0; -} - -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); -} -#else /* !CONFG_FAIR_GROUP_SCHED */ -static inline void free_fair_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg) -{ - int i; - - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -static -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; - struct rq *rq; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rq = cpu_rq(i); - - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); - if (!rt_se) - goto err; - - parent_se = parent ? 
parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); - } - - return 1; - - err: - return 0; -} - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} -#else /* !CONFIG_RT_GROUP_SCHED */ -static inline void free_rt_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_GROUP_SCHED -static void free_sched_group(struct task_group *tg) -{ - free_fair_sched_group(tg); - free_rt_sched_group(tg); - kfree(tg); -} - -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(struct task_group *parent) -{ - struct task_group *tg; - unsigned long flags; - int i; - - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - - if (!alloc_fair_sched_group(tg, parent)) - goto err; - - if (!alloc_rt_sched_group(tg, parent)) - goto err; - - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } - list_add_rcu(&tg->list, &task_groups); - - WARN_ON(!parent); /* root should already exist */ - - tg->parent = parent; - INIT_LIST_HEAD(&tg->children); - list_add_rcu(&tg->siblings, &parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); -} - -/* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) -{ - /* 
now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); -} - -/* Destroy runqueue etc associated with a task group */ -void sched_destroy_group(struct task_group *tg) -{ - unsigned long flags; - int i; - - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } - list_del_rcu(&tg->list); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); -} - -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) -{ - int on_rq, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - update_rq_clock(rq); - - running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; - - if (on_rq) - dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - - set_task_rq(tsk, task_cpu(tsk)); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); -#endif - - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, tsk, 0); - - task_rq_unlock(rq, &flags); -} -#endif /* CONFIG_GROUP_SCHED */ - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - se->load.weight = shares; - se->load.inv_weight = 0; - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = 
se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); -} - -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) -{ - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. - */ - if (!tg->se[0]) - return -EINVAL; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ - tg->shares = shares; - for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares); - } - - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); -done: - mutex_unlock(&shares_mutex); - return 0; -} - -unsigned long sched_group_shares(struct task_group *tg) -{ - return tg->shares; -} -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. 
- */ -static DEFINE_MUTEX(rt_constraints_mutex); - -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 16; - - return div64_u64(runtime << 16, period); -} - -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; - - if (!parent) { - if (global_rt_period() < period) - return 0; - - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } - - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; - - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; - - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); - } - rcu_read_unlock(); - - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); - - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; - - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); - } - rcu_read_unlock(); - - return total + to_ratio(period, runtime) < global_ratio; -} -#endif - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; -} - -static int tg_set_bandwidth(struct task_group *tg, - u64 rt_period, u64 rt_runtime) -{ - int i, err = 0; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); 
- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; - goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } - - spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); - tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); - tg->rt_bandwidth.rt_runtime = rt_runtime; - - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = tg->rt_rq[i]; - - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_runtime; - spin_unlock(&rt_rq->rt_runtime_lock); - } - spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return err; -} - -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) -{ - u64 rt_runtime, rt_period; - - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us < 0) - rt_runtime = RUNTIME_INF; - - return tg_set_bandwidth(tg, rt_period, rt_runtime); -} - -long sched_group_rt_runtime(struct task_group *tg) -{ - u64 rt_runtime_us; - - if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) - return -1; - - rt_runtime_us = tg->rt_bandwidth.rt_runtime; - do_div(rt_runtime_us, NSEC_PER_USEC); - return rt_runtime_us; -} - -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) -{ - u64 rt_runtime, rt_period; - - rt_period = (u64)rt_period_us * NSEC_PER_USEC; - rt_runtime = tg->rt_bandwidth.rt_runtime; - - if (rt_period == 0) - return -EINVAL; - - return tg_set_bandwidth(tg, rt_period, rt_runtime); -} - -long sched_group_rt_period(struct task_group *tg) -{ - u64 rt_period_us; - - rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); - do_div(rt_period_us, NSEC_PER_USEC); - return rt_period_us; -} - -static int sched_rt_global_constraints(void) -{ - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; - int ret = 0; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - rt_period = 
ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; - - mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; - mutex_unlock(&rt_constraints_mutex); - - return ret; -} -#else /* !CONFIG_RT_GROUP_SCHED */ -static int sched_rt_global_constraints(void) -{ - unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - spin_unlock(&rt_rq->rt_runtime_lock); - } - spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - - return 0; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } - } - mutex_unlock(&mutex); - - return ret; -} - -#ifdef CONFIG_CGROUP_SCHED - -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); -} - -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct task_group *tg, *parent; - - if (!cgrp->parent) { - /* This is early initialization for the top cgroup 
*/ - init_task_group.css.cgroup = cgrp; - return &init_task_group.css; - } - - parent = cgroup_tg(cgrp->parent); - tg = sched_create_group(parent); - if (IS_ERR(tg)) - return ERR_PTR(-ENOMEM); - - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - - return &tg->css; -} - -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct task_group *tg = cgroup_tg(cgrp); - - sched_destroy_group(tg); -} - -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) -{ -#ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) - return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; -#endif - - return 0; -} - -static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) -{ - sched_move_task(tsk); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) -{ - return sched_group_set_shares(cgroup_tg(cgrp), shareval); -} - -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) -{ - struct task_group *tg = cgroup_tg(cgrp); - - return (u64) tg->shares; -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) -{ - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); -} - -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) -{ - return sched_group_rt_runtime(cgroup_tg(cgrp)); -} - -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) -{ - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); -} - -static u64 
cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) -{ - return sched_group_rt_period(cgroup_tg(cgrp)); -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", - .read_u64 = cpu_shares_read_u64, - .write_u64 = cpu_shares_write_u64, - }, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif -}; - -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} - -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .populate = cpu_cgroup_populate, - .subsys_id = cpu_cgroup_subsys_id, - .early_init = 1, -}; - -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). 
- */ - -/* track cpu usage of a group of tasks */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 *cpuusage; -}; - -struct cgroup_subsys cpuacct_subsys; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - - if (!ca) - return ERR_PTR(-ENOMEM); - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } - - return &ca->css; -} - -/* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpuusage); - kfree(ca); -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. 
- */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } -out: - return err; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, -}; - -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); -} - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -static void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - - if (!cpuacct_subsys.active) - return; - - ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); - - *cpuusage += cputime; - } -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, - .populate = cpuacct_populate, - .subsys_id = cpuacct_subsys_id, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ -/* - * sched_clock for unstable cpu clocks - * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra - * - * Updates and enhancements: - * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt - * - * Based on code by: - * Ingo Molnar - * Guillaume Chazarain - * - * Create a semi stable clock from a mixture of other events, including: - * - gtod - * - sched_clock() - * - explicit idle events - * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. 
- * - * Furthermore, explicit sleep and wakeup hooks allow us to account for time - * that is otherwise invisible (TSC gets stopped). - * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). - */ -#include -#include -#include -#include -#include - -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - -static __read_mostly int sched_clock_running; - -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - -struct sched_clock_data { - /* - * Raw spinlock - this is a special case: this might be called - * from within instrumentation code so we dont want to do any - * instrumentation ourselves. - */ - raw_spinlock_t lock; - - u64 tick_raw; - u64 tick_gtod; - u64 clock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); - -static inline struct sched_clock_data *this_scd(void) -{ - return &__get_cpu_var(sched_clock_data); -} - -static inline struct sched_clock_data *cpu_sdc(int cpu) -{ - return &per_cpu(sched_clock_data, cpu); -} - -void sched_clock_init(void) -{ - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } - - sched_clock_running = 1; -} - -/* - * min,max except they take wrapping into account - */ - -static inline u64 wrap_min(u64 x, u64 y) -{ - return (s64)(x - y) < 0 ? x : y; -} - -static inline u64 wrap_max(u64 x, u64 y) -{ - return (s64)(x - y) > 0 ? 
x : y; -} - -/* - * update the percpu scd from the raw @now value - * - * - filter out backward motion - * - use the GTOD tick value to create a window to filter crazy TSC values - */ -static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) -{ - s64 delta = now - scd->tick_raw; - u64 clock, min_clock, max_clock; - - WARN_ON_ONCE(!irqs_disabled()); - - if (unlikely(delta < 0)) - delta = 0; - - /* - * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); - */ - - clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); - - clock = wrap_max(clock, min_clock); - clock = wrap_min(clock, max_clock); - - scd->clock = clock; - - return scd->clock; -} - -static void lock_double_clock(struct sched_clock_data *data1, - struct sched_clock_data *data2) -{ - if (data1 < data2) { - __raw_spin_lock(&data1->lock); - __raw_spin_lock(&data2->lock); - } else { - __raw_spin_lock(&data2->lock); - __raw_spin_lock(&data1->lock); - } -} - -u64 sched_clock_cpu(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - u64 now, clock, this_clock, remote_clock; - - if (unlikely(!sched_clock_running)) - return 0ull; - - WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); - - if (cpu != raw_smp_processor_id()) { - struct sched_clock_data *my_scd = this_scd(); - - lock_double_clock(scd, my_scd); - - this_clock = __update_sched_clock(my_scd, now); - remote_clock = scd->clock; - - /* - * Use the opportunity that we have both locks - * taken to couple the two clocks: we take the - * larger time as the latest time for both - * runqueues. 
(this creates monotonic movement) - */ - if (likely((s64)(remote_clock - this_clock) < 0)) { - clock = this_clock; - scd->clock = clock; - } else { - /* - * Should be rare, but possible: - */ - clock = remote_clock; - my_scd->clock = remote_clock; - } - - __raw_spin_unlock(&my_scd->lock); - } else { - __raw_spin_lock(&scd->lock); - clock = __update_sched_clock(scd, now); - } - - __raw_spin_unlock(&scd->lock); - - return clock; -} - -void sched_clock_tick(void) -{ - struct sched_clock_data *scd = this_scd(); - u64 now, now_gtod; - - if (unlikely(!sched_clock_running)) - return; - - WARN_ON_ONCE(!irqs_disabled()); - - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); - - __raw_spin_lock(&scd->lock); - scd->tick_raw = now; - scd->tick_gtod = now_gtod; - __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); -} - -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - sched_clock_cpu(smp_processor_id()); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - if (timekeeping_suspended) - return; - - sched_clock_tick(); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - -u64 sched_clock_cpu(int cpu) -{ - if (unlikely(!sched_clock_running)) - return 0; - - return sched_clock(); -} - -#endif - -unsigned long long cpu_clock(int cpu) -{ - unsigned long long clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); - - return clock; -} -EXPORT_SYMBOL_GPL(cpu_clock); -/* - * kernel/sched_cpupri.c - * - * CPU priority management - * - * Copyright (C) 2007-2008 Novell - * - * Author: Gregory Haskins - * - * This code tracks the priority of each CPU so that global migration - * 
decisions are easy to calculate. Each CPU can be in a state as follows: - * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 - * - * going from the lowest priority to the highest. CPUs in the INVALID state - * are not eligible for routing. The system maintains this state with - * a 2 dimensional bitmap (the first for priority class, the second for cpus - * in that class). Therefore a typical application without affinity - * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit - * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that - * yields the worst case search is fairly contrived. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include "sched_cpupri.h" - -/* Convert between a 140 based task->prio, and our 102 based cpupri */ -static int convert_prio(int prio) -{ - int cpupri; - - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio + 1; - - return cpupri; -} - -#define for_each_cpupri_active(array, idx) \ - for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ - idx < CPUPRI_NR_PRIORITIES; \ - idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) - -/** - * cpupri_find - find the best (lowest-pri) CPU in the system - * @cp: The cpupri context - * @p: The task - * @lowest_mask: A mask to fill in with selected CPUs - * - * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in - * fact changed priorities any number of times. 
While not ideal, it is not - * an issue of correctness since the normal rebalancer logic will correct - * any discrepancies created by racing against the uncertainty of the current - * priority configuration. - * - * Returns: (int)bool - CPUs were found - */ -int cpupri_find(struct cpupri *cp, struct task_struct *p, - cpumask_t *lowest_mask) -{ - int idx = 0; - int task_pri = convert_prio(p->prio); - - for_each_cpupri_active(cp->pri_active, idx) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - cpumask_t mask; - - if (idx >= task_pri) - break; - - cpus_and(mask, p->cpus_allowed, vec->mask); - - if (cpus_empty(mask)) - continue; - - *lowest_mask = mask; - return 1; - } - - return 0; -} - -/** - * cpupri_set - update the cpu priority setting - * @cp: The cpupri context - * @cpu: The target cpu - * @pri: The priority (INVALID-RT99) to assign to this CPU - * - * Note: Assumes cpu_rq(cpu)->lock is locked - * - * Returns: (void) - */ -void cpupri_set(struct cpupri *cp, int cpu, int newpri) -{ - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - unsigned long flags; - - newpri = convert_prio(newpri); - - BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); - - if (newpri == oldpri) - return; - - /* - * If the cpu was currently mapped to a different value, we - * first need to unmap the old value - */ - if (likely(oldpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - - spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); - cpu_clear(cpu, vec->mask); - - spin_unlock_irqrestore(&vec->lock, flags); - } - - if (likely(newpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - - spin_lock_irqsave(&vec->lock, flags); - - cpu_set(cpu, vec->mask); - vec->count++; - if (vec->count == 1) - set_bit(newpri, cp->pri_active); - - spin_unlock_irqrestore(&vec->lock, flags); - } - - *currpri = newpri; -} - -/** - * cpupri_init - initialize the cpupri structure - * @cp: The 
cpupri context - * - * Returns: (void) - */ -void cpupri_init(struct cpupri *cp) -{ - int i; - - memset(cp, 0, sizeof(*cp)); - - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { - struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - - spin_lock_init(&vec->lock); - vec->count = 0; - cpus_clear(vec->mask); - } - - for_each_possible_cpu(i) - cp->cpu_to_pri[i] = CPUPRI_INVALID; -} - - -/* - * kernel/time/sched_debug.c - * - * Print the CFS rbtree - * - * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include - -/* - * This allows printing both to /proc/sched_debug and - * to the console - */ -#define SEQ_printf(m, x...) \ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) - -/* - * Ease the printing of nsec fields: - */ -static long long nsec_high(unsigned long long nsec) -{ - if ((long long)nsec < 0) { - nsec = -nsec; - do_div(nsec, 1000000); - return -nsec; - } - do_div(nsec, 1000000); - - return nsec; -} - -static unsigned long nsec_low(unsigned long long nsec) -{ - if ((long long)nsec < 0) - nsec = -nsec; - - return do_div(nsec, 1000000); -} - -#define SPLIT_NS(x) nsec_high(x), nsec_low(x) - -static void -print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) -{ - if (rq->curr == p) - SEQ_printf(m, "R"); - else - SEQ_printf(m, " "); - - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, - SPLIT_NS(p->se.vruntime), - (long long)(p->nvcsw + p->nivcsw), - p->prio); -#ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.vruntime), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.sum_sleep_runtime)); -#else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", - 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); -#endif - -#ifdef 
CONFIG_CGROUP_SCHED - { - char path[64]; - - cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); - SEQ_printf(m, " %s", path); - } -#endif - SEQ_printf(m, "\n"); -} - -static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) -{ - struct task_struct *g, *p; - unsigned long flags; - - SEQ_printf(m, - "\nrunnable tasks:\n" - " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" - "------------------------------------------------------" - "----------------------------------------------------\n"); - - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) - continue; - - print_task(m, rq, p); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; - struct rq *rq = &per_cpu(runqueues, cpu); - struct sched_entity *last; - unsigned long flags; - -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128] = ""; - struct cgroup *cgroup = NULL; - struct task_group *tg = cfs_rq->tg; - - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); - - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif - - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); - - spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; - last = __pick_last_entity(cfs_rq); - if (last) - max_vruntime = last->vruntime; - min_vruntime = rq->cfs.min_vruntime; - rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; - spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", - SPLIT_NS(MIN_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - 
SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - - P(yld_exp_empty); - P(yld_act_empty); - P(yld_both_empty); - P(yld_count); - - P(sched_switch); - P(sched_count); - P(sched_goidle); - - P(ttwu_count); - P(ttwu_local); - - P(bkl_count); - -#undef P -#endif - SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", - cfs_rq->nr_spread_over); -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); -#endif -#endif -} - -void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) -{ -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128] = ""; - struct cgroup *cgroup = NULL; - struct task_group *tg = rt_rq->tg; - - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); - - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); -#else - SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); -#endif - - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) -#define PN(x) \ - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) - - P(rt_nr_running); - P(rt_throttled); - PN(rt_time); - PN(rt_runtime); - -#undef PN -#undef P -} - -static void print_cpu(struct seq_file *m, int cpu) -{ - struct rq *rq = &per_cpu(runqueues, cpu); - -#ifdef CONFIG_X86 - { - unsigned int freq = cpu_khz ? 
: 1; - - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", - cpu, freq / 1000, (freq % 1000)); - } -#else - SEQ_printf(m, "\ncpu#%d\n", cpu); -#endif - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) -#define PN(x) \ - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) - - P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->load.weight); - P(nr_switches); - P(nr_load_updates); - P(nr_uninterruptible); - SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - PN(next_balance); - P(curr->pid); - PN(clock); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); -#undef P -#undef PN - - print_cfs_stats(m, cpu); - print_rt_stats(m, cpu); - - print_rq(m, rq, cpu); -} - -static int sched_debug_show(struct seq_file *m, void *v) -{ - u64 now = ktime_to_ns(ktime_get()); - int cpu; - - SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); - -#define P(x) \ - SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) -#define PN(x) \ - SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); - PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); - P(sysctl_sched_features); -#undef PN -#undef P - - for_each_online_cpu(cpu) - print_cpu(m, cpu); - - SEQ_printf(m, "\n"); - - return 0; -} - -static void sysrq_sched_debug_show(void) -{ - sched_debug_show(NULL, NULL); -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_debug_show, NULL); -} - -static const struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init init_sched_debug_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("sched_debug", 
0644, NULL, &sched_debug_fops); - if (!pe) - return -ENOMEM; - return 0; -} - -__initcall(init_sched_debug_procfs); - -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ - unsigned long nr_switches; - unsigned long flags; - int num_threads = 1; - - rcu_read_lock(); - if (lock_task_sighand(p, &flags)) { - num_threads = atomic_read(&p->signal->count); - unlock_task_sighand(p, &flags); - } - rcu_read_unlock(); - - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); - SEQ_printf(m, - "---------------------------------------------------------\n"); -#define __P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - - PN(se.exec_start); - PN(se.vruntime); - PN(se.sum_exec_runtime); - PN(se.avg_overlap); - - nr_switches = p->nvcsw + p->nivcsw; - -#ifdef CONFIG_SCHEDSTATS - PN(se.wait_start); - PN(se.sleep_start); - PN(se.block_start); - PN(se.sleep_max); - PN(se.block_max); - PN(se.exec_max); - PN(se.slice_max); - PN(se.wait_max); - PN(se.wait_sum); - P(se.wait_count); - P(sched_info.bkl_count); - P(se.nr_migrations); - P(se.nr_migrations_cold); - P(se.nr_failed_migrations_affine); - P(se.nr_failed_migrations_running); - P(se.nr_failed_migrations_hot); - P(se.nr_forced_migrations); - P(se.nr_forced2_migrations); - P(se.nr_wakeups); - P(se.nr_wakeups_sync); - P(se.nr_wakeups_migrate); - P(se.nr_wakeups_local); - P(se.nr_wakeups_remote); - P(se.nr_wakeups_affine); - P(se.nr_wakeups_affine_attempts); - P(se.nr_wakeups_passive); - P(se.nr_wakeups_idle); - - { - u64 avg_atom, avg_per_cpu; - - avg_atom = p->se.sum_exec_runtime; - if (nr_switches) - do_div(avg_atom, nr_switches); - else - avg_atom = -1LL; - - avg_per_cpu = p->se.sum_exec_runtime; - if (p->se.nr_migrations) { - avg_per_cpu = 
div64_u64(avg_per_cpu, - p->se.nr_migrations); - } else { - avg_per_cpu = -1LL; - } - - __PN(avg_atom); - __PN(avg_per_cpu); - } -#endif - __P(nr_switches); - SEQ_printf(m, "%-35s:%21Ld\n", - "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-35s:%21Ld\n", - "nr_involuntary_switches", (long long)p->nivcsw); - - P(se.load.weight); - P(policy); - P(prio); -#undef PN -#undef __PN -#undef P -#undef __P - - { - u64 t0, t1; - - t0 = sched_clock(); - t1 = sched_clock(); - SEQ_printf(m, "%-35s:%21Ld\n", - "clock-delta", (long long)(t1-t0)); - } -} - -void proc_sched_set_task(struct task_struct *p) -{ -#ifdef CONFIG_SCHEDSTATS - p->se.wait_max = 0; - p->se.wait_sum = 0; - p->se.wait_count = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.nr_migrations = 0; - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - p->sched_info.bkl_count = 0; -#endif - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->nvcsw = 0; - p->nivcsw = 0; -} -/* - * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) - * - * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar - * - * Interactivity improvements by Mike Galbraith - * (C) 2007 Mike Galbraith - * - * Various enhancements by Dmitry Adamushko. 
- * (C) 2007 Dmitry Adamushko - * - * Group scheduling enhancements by Srivatsa Vaddagiri - * Copyright IBM Corporation, 2007 - * Author: Srivatsa Vaddagiri - * - * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner - * - * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - */ - -#include - -/* - * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. - * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - */ -unsigned int sysctl_sched_latency = 20000000ULL; - -/* - * Minimal preemption granularity for CPU-bound tasks: - * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_min_granularity = 4000000ULL; - -/* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 5; - -/* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; - -/* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - -/* - * SCHED_OTHER wake-up granularity. - * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. 
- */ -unsigned int sysctl_sched_wakeup_granularity = 5000000UL; - -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; - -/************************************************************** - * CFS operations on generic schedulable entities: - */ - -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? 
*/ -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return 1; - - return 0; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - -#define entity_is_task(se) 1 - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return NULL; -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - - -/************************************************************** - * Scheduling class tree data structure manipulation methods: - */ - -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) - min_vruntime = vruntime; - - return min_vruntime; -} - -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) - min_vruntime = vruntime; - - return min_vruntime; -} - -static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return se->vruntime - cfs_rq->min_vruntime; -} - -/* 
- * Enqueue an entity into the rb-tree: - */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - s64 key = entity_key(cfs_rq, se); - int leftmost = 1; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (key < entity_key(cfs_rq, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } - } - - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) { - cfs_rq->rb_leftmost = &se->run_node; - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. - */ - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, se->vruntime); - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); -} - -static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - struct sched_entity *next; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - - if (next_node) { - next = rb_entry(next_node, - struct sched_entity, run_node); - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, - next->vruntime); - } - } - - if (cfs_rq->next == se) - cfs_rq->next = NULL; - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); -} - -static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rb_leftmost; -} - -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) -{ - return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); -} - -static inline struct sched_entity *__pick_last_entity(struct cfs_rq 
*cfs_rq) -{ - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - - if (!last) - return NULL; - - return rb_entry(last, struct sched_entity, run_node); -} - -/************************************************************** - * Scheduling class statistics methods: - */ - -#ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); - - if (ret || !write) - return ret; - - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - - return 0; -} -#endif - -/* - * delta *= w / rw - */ -static inline unsigned long -calc_delta_weight(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - se->load.weight, &cfs_rq_of(se)->load); - } - - return delta; -} - -/* - * delta *= rw / w - */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, &se->load); - } - - return delta; -} - -/* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ? l : l*nr/nl - */ -static u64 __sched_period(unsigned long nr_running) -{ - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; -} - -/* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. 
- * - * s = p*w/rw - */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); -} - -/* - * We calculate the vruntime slice of a to be inserted task - * - * vs = s*rw/w = p - */ -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - unsigned long nr_running = cfs_rq->nr_running; - - if (!se->on_rq) - nr_running++; - - return __sched_period(nr_running); -} - -/* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. 
- */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - curr->vruntime += delta_exec_weighted; -} - -static void update_curr(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; - unsigned long delta_exec; - - if (unlikely(!curr)) - return; - - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - - __update_curr(cfs_rq, curr, delta_exec); - curr->exec_start = now; - - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - cpuacct_charge(curtask, delta_exec); - } -} - -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); -} - -/* - * Task is being enqueued - update stats: - */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * Are we enqueueing a waiting task? 
(for current tasks - * a dequeue/enqueue event is a NOP) - */ - if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); -} - -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - schedstat_set(se->wait_count, se->wait_count + 1); - schedstat_set(se->wait_sum, se->wait_sum + - rq_of(cfs_rq)->clock - se->wait_start); - schedstat_set(se->wait_start, 0); -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * Mark the end of the wait period if dequeueing a - * waiting task: - */ - if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); -} - -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * We are starting a new run period: - */ - se->exec_start = rq_of(cfs_rq)->clock; -} - -/************************************************** - * Scheduling class queueing methods: - */ - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - -static void -account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) - add_cfs_task_weight(cfs_rq, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); -} - -static void -account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) - 
add_cfs_task_weight(cfs_rq, -se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; - list_del_init(&se->group_node); -} - -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - if (se->sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; - struct task_struct *tsk = task_of(se); - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; - - se->sleep_start = 0; - se->sum_sleep_runtime += delta; - - account_scheduler_latency(tsk, delta >> 10, 1); - } - if (se->block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->block_start; - struct task_struct *tsk = task_of(se); - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->block_max)) - se->block_max = delta; - - se->block_start = 0; - se->sum_sleep_runtime += delta; - - /* - * Blocking time is in units of nanosecs, so shift by 20 to - * get a milliseconds-range estimation of the amount of - * time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - - profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } -#endif -} - -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; - - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); -#endif -} - -static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -{ - u64 vruntime; - - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(cfs_rq->min_vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = cfs_rq->min_vruntime; - - /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. 
- */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice_add(cfs_rq, se); - - if (!initial) { - /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { - unsigned long thresh = sysctl_sched_latency; - - /* - * convert the sleeper threshold into virtual time - */ - if (sched_feat(NORMALIZED_SLEEPER)) - thresh = calc_delta_fair(thresh, se); - - vruntime -= thresh; - } - - /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - } - - se->vruntime = vruntime; -} - -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) -{ - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - account_entity_enqueue(cfs_rq, se); - - if (wakeup) { - place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); - } - - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); -} - -static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) -{ - /* - * Update run-time statistics of the 'current'. 
- */ - update_curr(cfs_rq); - - update_stats_dequeue(cfs_rq, se); - if (sleep) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = rq_of(cfs_rq)->clock; - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = rq_of(cfs_rq)->clock; - } -#endif - } - - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - account_entity_dequeue(cfs_rq, se); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - - ideal_runtime = sched_slice(cfs_rq, curr); - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); -} - -static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* 'current' is not kept within the tree. */ - if (se->on_rq) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. - */ - update_stats_wait_end(cfs_rq, se); - __dequeue_entity(cfs_rq, se); - } - - update_stats_curr_start(cfs_rq, se); - cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS - /* - * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. 
dont track it - * when there are only lesser-weight tasks around): - */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->slice_max = max(se->slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); - } -#endif - se->prev_sum_exec_runtime = se->sum_exec_runtime; -} - -static struct sched_entity * -pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rq *rq = rq_of(cfs_rq); - u64 pair_slice = rq->clock - cfs_rq->pair_start; - - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { - cfs_rq->pair_start = rq->clock; - return se; - } - - return cfs_rq->next; -} - -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = NULL; - - if (first_fair(cfs_rq)) { - se = __pick_next_entity(cfs_rq); - se = pick_next(cfs_rq, se); - set_next_entity(cfs_rq, se); - } - - return se; -} - -static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) -{ - /* - * If still on the runqueue then deactivate_task() - * was not called and update_curr() has to be done: - */ - if (prev->on_rq) - update_curr(cfs_rq); - - check_spread(cfs_rq, prev); - if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); - /* Put 'current' back into the tree. */ - __enqueue_entity(cfs_rq, prev); - } - cfs_rq->curr = NULL; -} - -static void -entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) -{ - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - -#ifdef CONFIG_SCHED_HRTICK - /* - * queued ticks are scheduled to match the slice, so don't bother - * validating it and just reschedule. 
- */ - if (queued) { - resched_task(rq_of(cfs_rq)->curr); - return; - } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; -#endif - - if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) - check_preempt_tick(cfs_rq, curr); -} - -/************************************************** - * CFS operations on tasks: - */ - -#ifdef CONFIG_SCHED_HRTICK -static void hrtick_start_fair(struct rq *rq, struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - WARN_ON(task_rq(p) != rq); - - if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { - u64 slice = sched_slice(cfs_rq, se); - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; - s64 delta = slice - ran; - - if (delta < 0) { - if (rq->curr == p) - resched_task(p); - return; - } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - - hrtick_start(rq, delta); - } -} -#else /* !CONFIG_SCHED_HRTICK */ -static inline void -hrtick_start_fair(struct rq *rq, struct task_struct *p) -{ -} -#endif - -/* - * The enqueue_task method is called before nr_running is - * increased. Here we update the fair scheduling stats and - * then put the task into the rbtree: - */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - if (se->on_rq) - break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; - } - - hrtick_start_fair(rq, rq->curr); -} - -/* - * The dequeue_task method is called before nr_running is - * decreased. 
We remove the task from the rbtree and - * update the fair scheduling stats: - */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep); - /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) - break; - sleep = 1; - } - - hrtick_start_fair(rq, rq->curr); -} - -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(cfs_rq->nr_running == 1)) - return; - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) - * - * Returns the CPU we should wake onto. 
- */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, struct task_struct *p) -{ - cpumask_t tmp; - struct sched_domain *sd; - int i; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - -#ifdef CONFIG_SMP - -static const struct sched_class fair_sched_class; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. 
- * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * - */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - long more_w; - - if (!tg->parent) - return wl; - - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - - long S, rw, s, a, b; - - S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; - - a = S*(rw + wl); - b = S*rw + s*wg; - - wl = s*(a-b)/D(b); - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. 
- */ - wg = 0; -#undef D - } - - return wl; -} - -#else - -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) -{ - return wl; -} - -#endif - -static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) -{ - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; - unsigned long tl_per_task; - unsigned long weight; - int balanced; - - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - tg = task_group(current); - weight = current->se.load.weight; - - tl += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.load.weight; - - balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. 
- */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - - return 1; - } - return 0; -} - -static int select_task_rq_fair(struct task_struct *p, int sync) -{ - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *rq, *this_rq; - unsigned int imbalance; - int idx; - - prev_cpu = task_cpu(p); - rq = task_rq(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; - - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { - this_sd = sd; - break; - } - } - - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) - goto out; - - idx = this_sd->wake_idx; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); - - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; - - if (prev_cpu == this_cpu) - goto out; - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; - } - } - -out: - return wake_idle(new_cpu, p); -} -#endif /* CONFIG_SMP */ - -static unsigned long wakeup_gran(struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; - - /* - * More easily preempt - nice tasks, while not making it harder for - * + nice tasks. 
- */ - if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); - - return gran; -} - -/* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; - - if (unlikely(rt_prio(p->prio))) { - update_rq_clock(rq); - update_curr(cfs_rq); - resched_task(curr); - return; - } - - if (unlikely(se == pse)) - return; - - cfs_rq_of(pse)->next = pse; - - /* - * Batch tasks do not preempt (their preemption is driven by - * the tick): - */ - if (unlikely(p->policy == SCHED_BATCH)) - return; - - if (!sched_feat(WAKEUP_PREEMPT)) - return; - - if (sched_feat(WAKEUP_OVERLAP) && sync && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) { - resched_task(curr); - return; - } - - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. 
- */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } - - if (wakeup_preempt_entity(se, pse) == 1) - resched_task(curr); -} - -static struct task_struct *pick_next_task_fair(struct rq *rq) -{ - struct task_struct *p; - struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; - - if (unlikely(!cfs_rq->nr_running)) - return NULL; - - do { - se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - hrtick_start_fair(rq, p); - - return p; -} - -/* - * Account for a descheduled task: - */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) -{ - struct sched_entity *se = &prev->se; - struct cfs_rq *cfs_rq; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se); - } -} - -#ifdef CONFIG_SMP -/************************************************** - * Fair scheduling class load-balancing methods: - */ - -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. 
Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) -{ - struct task_struct *p = NULL; - struct sched_entity *se; - - if (next == &cfs_rq->tasks) - return NULL; - - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); - - return p; -} - -static struct task_struct *load_balance_start_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; - - return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); -} - -static struct task_struct *load_balance_next_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; - - return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); -} - -static unsigned long -__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, - struct cfs_rq *cfs_rq) -{ - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - cfs_rq_iterator.arg = cfs_rq; - - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &cfs_rq_iterator); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; - - rcu_read_lock(); - update_h_load(busiest_cpu); - - list_for_each_entry(tg, &task_groups, list) { - struct 
cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; - u64 rem_load, moved_load; - - /* - * empty group - */ - if (!busiest_cfs_rq->task_weight) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - rem_load, sd, idle, all_pinned, this_best_prio, - tg->cfs_rq[busiest_cpu]); - - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, busiest_weight + 1); - - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); - - return max_load_move - rem_load_move; -} -#else -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return __load_balance_fair(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &busiest->cfs); -} -#endif - -static int -move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - struct cfs_rq *busy_cfs_rq; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, - &cfs_rq_iterator)) - return 1; - } - - return 0; -} -#endif /* CONFIG_SMP */ - -/* - * scheduler tick hitting a task of our scheduling class: - */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &curr->se; - 
- for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se, queued); - } -} - -#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) - -/* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. - */ -static void task_new_fair(struct rq *rq, struct task_struct *p) -{ - struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se, *curr = cfs_rq->curr; - int this_cpu = smp_processor_id(); - - sched_info_queued(p); - - update_curr(cfs_rq); - place_entity(cfs_rq, se, 1); - - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && curr->vruntime < se->vruntime) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - } - - enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); -} - -/* - * Priority of the task has changed. Check to see if we preempt - * the current task. - */ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) -{ - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); -} - -/* - * We switched to the sched_fair class. - */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) -{ - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. 
- */ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); -} - -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) -{ - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); -} -#endif - -/* - * All the scheduling class methods: - */ -static const struct sched_class fair_sched_class = { - .next = &idle_sched_class, - .enqueue_task = enqueue_task_fair, - .dequeue_task = dequeue_task_fair, - .yield_task = yield_task_fair, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_fair, -#endif /* CONFIG_SMP */ - - .check_preempt_curr = check_preempt_wakeup, - - .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, - -#ifdef CONFIG_SMP - .load_balance = load_balance_fair, - .move_one_task = move_one_task_fair, -#endif - - .set_curr_task = set_curr_task_fair, - .task_tick = task_tick_fair, - .task_new = task_new_fair, - - .prio_changed = prio_changed_fair, - .switched_to = switched_to_fair, - -#ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, -#endif -}; - -#ifdef CONFIG_SCHED_DEBUG -static void print_cfs_stats(struct seq_file *m, int cpu) -{ - struct cfs_rq *cfs_rq; - - rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) - print_cfs_rq(m, cpu, cfs_rq); - rcu_read_unlock(); -} -#endif -/* - * idle-task scheduling class. 
- * - * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched_fair.c) - */ - -#ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sync) -{ - return task_cpu(p); /* IDLE tasks as never migrated */ -} -#endif /* CONFIG_SMP */ -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) -{ - resched_task(rq->idle); -} - -static struct task_struct *pick_next_task_idle(struct rq *rq) -{ - schedstat_inc(rq, sched_goidle); - - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) -{ - spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - -#ifdef CONFIG_SMP -static unsigned long -load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return 0; -} - -static int -move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - return 0; -} -#endif - -static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) -{ -} - -static void set_curr_task_idle(struct rq *rq) -{ -} - -static void switched_to_idle(struct rq *rq, struct task_struct *p, - int running) -{ - /* Can this actually happen?? 
*/ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); -} - -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) -{ - /* This can happen for hot plug CPUS */ - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -static const struct sched_class idle_sched_class = { - /* .next is NULL */ - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, -#endif /* CONFIG_SMP */ - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task = pick_next_task_idle, - .put_prev_task = put_prev_task_idle, - -#ifdef CONFIG_SMP - .load_balance = load_balance_idle, - .move_one_task = move_one_task_idle, -#endif - - .set_curr_task = set_curr_task_idle, - .task_tick = task_tick_idle, - - .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ -}; -/* - * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR - * policies) - */ - -#ifdef CONFIG_SMP - -static inline int rt_overloaded(struct rq *rq) -{ - return atomic_read(&rq->rd->rto_count); -} - -static inline void rt_set_overload(struct rq *rq) -{ - if (!rq->online) - return; - - cpu_set(rq->cpu, rq->rd->rto_mask); - /* - * Make sure the mask is visible before we set - * the overload count. That is checked to determine - * if we should look at the mask. It would be a shame - * if we looked at the mask, but the mask was not - * updated yet. 
- */ - wmb(); - atomic_inc(&rq->rd->rto_count); -} - -static inline void rt_clear_overload(struct rq *rq) -{ - if (!rq->online) - return; - - /* the order here really doesn't matter */ - atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); -} - -static void update_rt_migration(struct rq *rq) -{ - if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { - if (!rq->rt.overloaded) { - rt_set_overload(rq); - rq->rt.overloaded = 1; - } - } else if (rq->rt.overloaded) { - rt_clear_overload(rq); - rq->rt.overloaded = 0; - } -} -#endif /* CONFIG_SMP */ - -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) -{ - return container_of(rt_se, struct task_struct, rt); -} - -static inline int on_rt_rq(struct sched_rt_entity *rt_se) -{ - return !list_empty(&rt_se->run_list); -} - -#ifdef CONFIG_RT_GROUP_SCHED - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - if (!rt_rq->tg) - return RUNTIME_INF; - - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) - -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ - return rt_rq->rq; -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ - return rt_se->rt_rq; -} - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = rt_se->parent) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return rt_se->my_q; -} - -static void enqueue_rt_entity(struct sched_rt_entity *rt_se); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); - -static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - struct sched_rt_entity *rt_se = rt_rq->rt_se; - - if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se); - if 
(rt_rq->highest_prio < curr->prio) - resched_task(curr); - } -} - -static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ - struct sched_rt_entity *rt_se = rt_rq->rt_se; - - if (rt_se && on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - -static int rt_se_boosted(struct sched_rt_entity *rt_se) -{ - struct rt_rq *rt_rq = group_rt_rq(rt_se); - struct task_struct *p; - - if (rt_rq) - return !!rt_rq->rt_nr_boosted; - - p = rt_task_of(rt_se); - return p->prio != p->normal_prio; -} - -#ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) -{ - return cpu_rq(smp_processor_id())->rd->span; -} -#else -static inline cpumask_t sched_rt_period_mask(void) -{ - return cpu_online_map; -} -#endif - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &rt_rq->tg->rt_bandwidth; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ - return container_of(rt_rq, struct rq, rt); -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ - struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); - - return &rq->rt; -} - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_running) - 
resched_task(rq_of_rt_rq(rt_rq)->curr); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline cpumask_t sched_rt_period_mask(void) -{ - return cpu_online_map; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return &cpu_rq(cpu)->rt; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; -} - -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP -static int do_balance_runtime(struct rt_rq *rt_rq) -{ - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - int i, weight, more = 0; - u64 rt_period; - - weight = cpus_weight(rd->span); - - spin_lock(&rt_b->rt_runtime_lock); - rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { - struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); - s64 diff; - - if (iter == rt_rq) - continue; - - spin_lock(&iter->rt_runtime_lock); - if (iter->rt_runtime == RUNTIME_INF) - goto next; - - diff = iter->rt_runtime - iter->rt_time; - if (diff > 0) { - diff = div_u64((u64)diff, weight); - if (rt_rq->rt_runtime + diff > rt_period) - diff = rt_period - rt_rq->rt_runtime; - iter->rt_runtime -= diff; - rt_rq->rt_runtime += diff; - more = 1; - if (rt_rq->rt_runtime == rt_period) { - spin_unlock(&iter->rt_runtime_lock); - break; - } - } -next: - spin_unlock(&iter->rt_runtime_lock); - } - spin_unlock(&rt_b->rt_runtime_lock); - - return more; -} - -static void __disable_runtime(struct rq *rq) -{ - struct root_domain *rd = rq->rd; - struct rt_rq *rt_rq; - - if (unlikely(!scheduler_running)) - return; - - for_each_leaf_rt_rq(rt_rq, rq) { - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - s64 want; - int i; - - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); - if (rt_rq->rt_runtime == RUNTIME_INF || 
- rt_rq->rt_runtime == rt_b->rt_runtime) - goto balanced; - spin_unlock(&rt_rq->rt_runtime_lock); - - want = rt_b->rt_runtime - rt_rq->rt_runtime; - - for_each_cpu_mask(i, rd->span) { - struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); - s64 diff; - - if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) - continue; - - spin_lock(&iter->rt_runtime_lock); - if (want > 0) { - diff = min_t(s64, iter->rt_runtime, want); - iter->rt_runtime -= diff; - want -= diff; - } else { - iter->rt_runtime -= want; - want -= want; - } - spin_unlock(&iter->rt_runtime_lock); - - if (!want) - break; - } - - spin_lock(&rt_rq->rt_runtime_lock); - BUG_ON(want); -balanced: - rt_rq->rt_runtime = RUNTIME_INF; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); - } -} - -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); -} - -static void __enable_runtime(struct rq *rq) -{ - struct rt_rq *rt_rq; - - if (unlikely(!scheduler_running)) - return; - - for_each_leaf_rt_rq(rt_rq, rq) { - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_b->rt_runtime; - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); - } -} - -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); -} - -static int balance_runtime(struct rt_rq *rt_rq) -{ - int more = 0; - - if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - } - - return more; -} -#else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} -#endif /* CONFIG_SMP */ - 
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) -{ - int i, idle = 1; - cpumask_t span; - - if (rt_b->rt_runtime == RUNTIME_INF) - return 1; - - span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { - int enqueue = 0; - struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); - struct rq *rq = rq_of_rt_rq(rt_rq); - - spin_lock(&rq->lock); - if (rt_rq->rt_time) { - u64 runtime; - - spin_lock(&rt_rq->rt_runtime_lock); - if (rt_rq->rt_throttled) - balance_runtime(rt_rq); - runtime = rt_rq->rt_runtime; - rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - enqueue = 1; - } - if (rt_rq->rt_time || rt_rq->rt_nr_running) - idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - } else if (rt_rq->rt_nr_running) - idle = 0; - - if (enqueue) - sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); - } - - return idle; -} - -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio; -#endif - - return rt_task_of(rt_se)->prio; -} - -static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) -{ - u64 runtime = sched_rt_runtime(rt_rq); - - if (rt_rq->rt_throttled) - return rt_rq_throttled(rt_rq); - - if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) - return 0; - - balance_runtime(rt_rq); - runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - - if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - if (rt_rq_throttled(rt_rq)) { - sched_rt_rq_dequeue(rt_rq); - return 1; - } - } - - return 0; -} - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. 
- */ -static void update_curr_rt(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - u64 delta_exec; - - if (!task_has_rt_policy(curr)) - return; - - delta_exec = rq->clock - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - - schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - curr->se.exec_start = rq->clock; - cpuacct_charge(curr, delta_exec); - - for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); - - spin_lock(&rt_rq->rt_runtime_lock); - if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); - } - spin_unlock(&rt_rq->rt_runtime_lock); - } -} - -static inline -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - WARN_ON(!rt_prio(rt_se_prio(rt_se))); - rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < rt_rq->highest_prio) { -#ifdef CONFIG_SMP - struct rq *rq = rq_of_rt_rq(rt_rq); -#endif - - rt_rq->highest_prio = rt_se_prio(rt_se); -#ifdef CONFIG_SMP - if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, - rt_se_prio(rt_se)); -#endif - } -#endif -#ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) { - struct rq *rq = rq_of_rt_rq(rt_rq); - - rq->rt.rt_nr_migratory++; - } - - update_rt_migration(rq_of_rt_rq(rt_rq)); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - if (rt_se_boosted(rt_se)) - rt_rq->rt_nr_boosted++; - - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); -#else - start_rt_bandwidth(&def_rt_bandwidth); -#endif -} - -static inline -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -#ifdef CONFIG_SMP - int highest_prio = rt_rq->highest_prio; -#endif - - WARN_ON(!rt_prio(rt_se_prio(rt_se))); - WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; -#if defined 
CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_rq->rt_nr_running) { - struct rt_prio_array *array; - - WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); - if (rt_se_prio(rt_se) == rt_rq->highest_prio) { - /* recalculate */ - array = &rt_rq->active; - rt_rq->highest_prio = - sched_find_first_bit(array->bitmap); - } /* otherwise leave rq->highest prio alone */ - } else - rt_rq->highest_prio = MAX_RT_PRIO; -#endif -#ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) { - struct rq *rq = rq_of_rt_rq(rt_rq); - rq->rt.rt_nr_migratory--; - } - - if (rt_rq->highest_prio != highest_prio) { - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, - rt_rq->highest_prio); - } - - update_rt_migration(rq_of_rt_rq(rt_rq)); -#endif /* CONFIG_SMP */ -#ifdef CONFIG_RT_GROUP_SCHED - if (rt_se_boosted(rt_se)) - rt_rq->rt_nr_boosted--; - - WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); -#endif -} - -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) -{ - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - struct rt_prio_array *array = &rt_rq->active; - struct rt_rq *group_rq = group_rt_rq(rt_se); - struct list_head *queue = array->queue + rt_se_prio(rt_se); - - /* - * Don't enqueue the group if its throttled, or when empty. - * The latter is a consequence of the former when a child group - * get throttled and the current group doesn't have any other - * active members. 
- */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) - return; - - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); - - inc_rt_tasks(rt_se, rt_rq); -} - -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) -{ - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - struct rt_prio_array *array = &rt_rq->active; - - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); - - dec_rt_tasks(rt_se, rt_rq); -} - -/* - * Because the prio of an upper entry depends on the lower - * entries, we must remove entries top - down. - */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) -{ - struct sched_rt_entity *back = NULL; - - for_each_sched_rt_entity(rt_se) { - rt_se->back = back; - back = rt_se; - } - - for (rt_se = back; rt_se; rt_se = rt_se->back) { - if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); - } -} - -static void enqueue_rt_entity(struct sched_rt_entity *rt_se) -{ - dequeue_rt_stack(rt_se); - for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se); -} - -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) -{ - dequeue_rt_stack(rt_se); - - for_each_sched_rt_entity(rt_se) { - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se); - } -} - -/* - * Adding/removing a task to/from a priority array: - */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) -{ - struct sched_rt_entity *rt_se = &p->rt; - - if (wakeup) - rt_se->timeout = 0; - - enqueue_rt_entity(rt_se); - - inc_cpu_load(rq, p->se.load.weight); -} - -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) -{ - struct sched_rt_entity *rt_se = &p->rt; - - update_curr_rt(rq); - dequeue_rt_entity(rt_se); - - dec_cpu_load(rq, p->se.load.weight); -} - -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by 
enqueue. - */ -static void -requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) -{ - if (on_rt_rq(rt_se)) { - struct rt_prio_array *array = &rt_rq->active; - struct list_head *queue = array->queue + rt_se_prio(rt_se); - - if (head) - list_move(&rt_se->run_list, queue); - else - list_move_tail(&rt_se->run_list, queue); - } -} - -static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) -{ - struct sched_rt_entity *rt_se = &p->rt; - struct rt_rq *rt_rq; - - for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); - requeue_rt_entity(rt_rq, rt_se, head); - } -} - -static void yield_task_rt(struct rq *rq) -{ - requeue_task_rt(rq, rq->curr, 0); -} - -#ifdef CONFIG_SMP -static int find_lowest_rq(struct task_struct *task); - -static int select_task_rq_rt(struct task_struct *p, int sync) -{ - struct rq *rq = task_rq(p); - - /* - * If the current task is an RT task, then - * try to see if we can wake this RT task up on another - * runqueue. Otherwise simply start this RT task - * on its current runqueue. - * - * We want to avoid overloading runqueues. Even if - * the RT task is of higher priority than the current RT task. - * RT tasks behave differently than other tasks. If - * one gets preempted, we try to push it off to another queue. - * So trying to keep a preempting RT task on the same - * cache hot CPU will force the running RT task to - * a cold CPU. So we waste all the cache for the lower - * RT task in hopes of saving some of a RT task - * that is just being woken and probably will have - * cold cache anyway. - */ - if (unlikely(rt_task(rq->curr)) && - (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); - - return (cpu == -1) ? 
task_cpu(p) : cpu; - } - - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); -} - -static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) -{ - cpumask_t mask; - - if (rq->curr->rt.nr_cpus_allowed == 1) - return; - - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) - return; - - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; - - /* - * There appears to be other cpus that can accept - * current and none to run 'p', so lets reschedule - * to try and push current away: - */ - requeue_task_rt(rq, p, 1); - resched_task(rq->curr); -} - -#endif /* CONFIG_SMP */ - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) -{ - if (p->prio < rq->curr->prio) { - resched_task(rq->curr); - return; - } - -#ifdef CONFIG_SMP - /* - * If: - * - * - the newly woken task is of equal priority to the current task - * - the newly woken task is non-migratable while current is migratable - * - current will be preempted on the next reschedule - * - * we should check to see if current can readily move to a different - * cpu. If so, we will reschedule to allow the push logic to try - * to move current somewhere else, making room for our non-migratable - * task. 
- */ - if (p->prio == rq->curr->prio && !need_resched()) - check_preempt_equal_prio(rq, p); -#endif -} - -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) -{ - struct rt_prio_array *array = &rt_rq->active; - struct sched_rt_entity *next = NULL; - struct list_head *queue; - int idx; - - idx = sched_find_first_bit(array->bitmap); - BUG_ON(idx >= MAX_RT_PRIO); - - queue = array->queue + idx; - next = list_entry(queue->next, struct sched_rt_entity, run_list); - - return next; -} - -static struct task_struct *pick_next_task_rt(struct rq *rq) -{ - struct sched_rt_entity *rt_se; - struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (unlikely(!rt_rq->rt_nr_running)) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; - - do { - rt_se = pick_next_rt_entity(rq, rt_rq); - BUG_ON(!rt_se); - rt_rq = group_rt_rq(rt_se); - } while (rt_rq); - - p = rt_task_of(rt_se); - p->se.exec_start = rq->clock; - return p; -} - -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) -{ - update_curr_rt(rq); - p->se.exec_start = 0; -} - -#ifdef CONFIG_SMP - -/* Only try algorithms three times */ -#define RT_MAX_TRIES 3 - -static int double_lock_balance(struct rq *this_rq, struct rq *busiest); -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); - -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); - -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && - (p->rt.nr_cpus_allowed > 1)) - return 1; - return 0; -} - -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) -{ - struct task_struct *next = NULL; - struct sched_rt_entity *rt_se; - struct rt_prio_array *array; - struct rt_rq *rt_rq; - int idx; - - for_each_leaf_rt_rq(rt_rq, rq) { - array = &rt_rq->active; - idx = 
sched_find_first_bit(array->bitmap); - next_idx: - if (idx >= MAX_RT_PRIO) - continue; - if (next && next->prio < idx) - continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p = rt_task_of(rt_se); - if (pick_rt_task(rq, p, cpu)) { - next = p; - break; - } - } - if (!next) { - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - goto next_idx; - } - } - - return next; -} - -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); - -static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) -{ - int first; - - /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) - return this_cpu; - - first = first_cpu(*mask); - if (first != NR_CPUS) - return first; - - return -1; -} - -static int find_lowest_rq(struct task_struct *task) -{ - struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); - int this_cpu = smp_processor_id(); - int cpu = task_cpu(task); - - if (task->rt.nr_cpus_allowed == 1) - return -1; /* No other targets possible */ - - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) - return -1; /* No targets found */ - - /* - * Only consider CPUs that are usable for migration. - * I guess we might want to change cpupri_find() to ignore those - * in the first place. - */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); - - /* - * At this point we have built a mask of cpus representing the - * lowest priority tasks in the system. Now we want to elect - * the best one based on our affinity and topology. - * - * We prioritize the last cpu that the task executed on since - * it is most likely cache-hot in that location. - */ - if (cpu_isset(cpu, *lowest_mask)) - return cpu; - - /* - * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data. 
- */ - if (this_cpu == cpu) - this_cpu = -1; /* Skip this_cpu opt if the same */ - - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - cpumask_t domain_mask; - int best_cpu; - - cpus_and(domain_mask, sd->span, *lowest_mask); - - best_cpu = pick_optimal_cpu(this_cpu, - &domain_mask); - if (best_cpu != -1) - return best_cpu; - } - } - - /* - * And finally, if there were no matches within the domains - * just give the caller *something* to work with from the compatible - * locations. - */ - return pick_optimal_cpu(this_cpu, lowest_mask); -} - -/* Will lock the rq it finds */ -static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) -{ - struct rq *lowest_rq = NULL; - int tries; - int cpu; - - for (tries = 0; tries < RT_MAX_TRIES; tries++) { - cpu = find_lowest_rq(task); - - if ((cpu == -1) || (cpu == rq->cpu)) - break; - - lowest_rq = cpu_rq(cpu); - - /* if the prio of this runqueue changed, try again */ - if (double_lock_balance(rq, lowest_rq)) { - /* - * We had to unlock the run queue. In - * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. - */ - if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || - task_running(rq, task) || - !task->se.on_rq)) { - - spin_unlock(&lowest_rq->lock); - lowest_rq = NULL; - break; - } - } - - /* If this rq is still suitable use it. */ - if (lowest_rq->rt.highest_prio > task->prio) - break; - - /* try again */ - double_unlock_balance(rq, lowest_rq); - lowest_rq = NULL; - } - - return lowest_rq; -} - -/* - * If the current CPU has more than one RT task, see if the non - * running task can migrate over to a CPU that is running a task - * of lesser priority. 
- */ -static int push_rt_task(struct rq *rq) -{ - struct task_struct *next_task; - struct rq *lowest_rq; - int ret = 0; - int paranoid = RT_MAX_TRIES; - - if (!rq->rt.overloaded) - return 0; - - next_task = pick_next_highest_task_rt(rq, -1); - if (!next_task) - return 0; - - retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); - return 0; - } - - /* - * It's possible that the next_task slipped in of - * higher priority than current. If that's the case - * just reschedule current. - */ - if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); - return 0; - } - - /* We might release rq lock */ - get_task_struct(next_task); - - /* find_lock_lowest_rq locks the rq if found */ - lowest_rq = find_lock_lowest_rq(next_task, rq); - if (!lowest_rq) { - struct task_struct *task; - /* - * find lock_lowest_rq releases rq->lock - * so it is possible that next_task has changed. - * If it has, then try again. - */ - task = pick_next_highest_task_rt(rq, -1); - if (unlikely(task != next_task) && task && paranoid--) { - put_task_struct(next_task); - next_task = task; - goto retry; - } - goto out; - } - - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); - - resched_task(lowest_rq->curr); - - double_unlock_balance(rq, lowest_rq); - - ret = 1; -out: - put_task_struct(next_task); - - return ret; -} - -/* - * TODO: Currently we just use the second highest prio task on - * the queue, and stop when it can't migrate (or there's - * no more RT tasks). There may be a case where a lower - * priority RT task has a different affinity than the - * higher RT task. In this case the lower RT task could - * possibly be able to migrate where as the higher priority - * RT task could not. We currently ignore this issue. - * Enhancements are welcome! 
- */ -static void push_rt_tasks(struct rq *rq) -{ - /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) - ; -} - -static int pull_rt_task(struct rq *this_rq) -{ - int this_cpu = this_rq->cpu, ret = 0, cpu; - struct task_struct *p, *next; - struct rq *src_rq; - - if (likely(!rt_overloaded(this_rq))) - return 0; - - next = pick_next_task_rt(this_rq); - - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { - if (this_cpu == cpu) - continue; - - src_rq = cpu_rq(cpu); - /* - * We can potentially drop this_rq's lock in - * double_lock_balance, and another CPU could - * steal our next task - hence we must cause - * the caller to recalculate the next task - * in that case: - */ - if (double_lock_balance(this_rq, src_rq)) { - struct task_struct *old_next = next; - - next = pick_next_task_rt(this_rq); - if (next != old_next) - ret = 1; - } - - /* - * Are there still pullable RT tasks? - */ - if (src_rq->rt.rt_nr_running <= 1) - goto skip; - - p = pick_next_highest_task_rt(src_rq, this_cpu); - - /* - * Do we have an RT task that preempts - * the to-be-scheduled task? - */ - if (p && (!next || (p->prio < next->prio))) { - WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); - - /* - * There's a chance that p is higher in priority - * than what's currently running on its cpu. - * This is just that p is wakeing up and hasn't - * had a chance to schedule. We only pull - * p if it is lower in priority than the - * current task on the run queue or - * this_rq next task is lower in prio than - * the current task on that rq. - */ - if (p->prio < src_rq->curr->prio || - (next && next->prio < src_rq->curr->prio)) - goto skip; - - ret = 1; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - /* - * We continue with the search, just in - * case there's an even higher prio task - * in another runqueue. 
(low likelyhood - * but possible) - * - * Update next so that we won't pick a task - * on another cpu with a priority lower (or equal) - * than the one we just picked. - */ - next = p; - - } - skip: - double_unlock_balance(this_rq, src_rq); - } - - return ret; -} - -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) - pull_rt_task(rq); -} - -static void post_schedule_rt(struct rq *rq) -{ - /* - * If we have more than one rt_task queued, then - * see if we can push the other rt_tasks off to other CPUS. - * Note we may release the rq lock, and since - * the lock was owned by prev, we need to release it - * first via finish_lock_switch and then reaquire it here. - */ - if (unlikely(rq->rt.overloaded)) { - spin_lock_irq(&rq->lock); - push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); - } -} - -/* - * If we are not running and we are not going to reschedule soon, we should - * try to push tasks away now - */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) -{ - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded) - push_rt_tasks(rq); -} - -static unsigned long -load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - /* don't touch RT tasks */ - return 0; -} - -static int -move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - /* don't touch RT tasks */ - return 0; -} - -static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) -{ - int weight = cpus_weight(*new_mask); - - BUG_ON(!rt_task(p)); - - /* - * Update the migration status of the RQ if we have an RT task - * which is running AND changing its weight value. 
- */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { - struct rq *rq = task_rq(p); - - if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { - rq->rt.rt_nr_migratory++; - } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } - - update_rt_migration(rq); - } - - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = weight; -} - -/* Assumes rq->lock is held */ -static void rq_online_rt(struct rq *rq) -{ - if (rq->rt.overloaded) - rt_set_overload(rq); - - __enable_runtime(rq); - - cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); -} - -/* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) -{ - if (rq->rt.overloaded) - rt_clear_overload(rq); - - __disable_runtime(rq); - - cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); -} - -/* - * When switch from the rt queue, we bring ourselves to a position - * that we might want to pull RT tasks from other runqueues. - */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) -{ - /* - * If there are other RT tasks then we will reschedule - * and the scheduling of the other RT tasks will handle - * the balancing. But if we are the last RT task - * we may need to handle the pulling of RT tasks - * now. - */ - if (!rq->rt.rt_nr_running) - pull_rt_task(rq); -} -#endif /* CONFIG_SMP */ - -/* - * When switching a task to RT, we may overload the runqueue - * with RT tasks. In this case we try to push them off to - * other runqueues. - */ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) -{ - int check_resched = 1; - - /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task - * then see if we can move to another run queue. 
- */ - if (!running) { -#ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && - /* Don't resched if we changed runqueues */ - rq != task_rq(p)) - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); - } -} - -/* - * Priority of the task has changed. This may cause - * us to initiate a push or pull. - */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) -{ - if (running) { -#ifdef CONFIG_SMP - /* - * If our priority decreases while running, we - * may need to pull tasks to this runqueue. - */ - if (oldprio < p->prio) - pull_rt_task(rq); - /* - * If there's a higher priority task waiting to run - * then reschedule. Note, the above pull_rt_task - * can release the rq lock and p could migrate. - * Only reschedule if p is still on the same runqueue. - */ - if (p->prio > rq->rt.highest_prio && rq->curr == p) - resched_task(p); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_task(p); -#endif /* CONFIG_SMP */ - } else { - /* - * This task is not running, but if it is - * greater than the current running task - * then reschedule. - */ - if (p->prio < rq->curr->prio) - resched_task(rq->curr); - } -} - -static void watchdog(struct rq *rq, struct task_struct *p) -{ - unsigned long soft, hard; - - if (!p->signal) - return; - - soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; - hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; - - if (soft != RLIM_INFINITY) { - unsigned long next; - - p->rt.timeout++; - next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (p->rt.timeout > next) - p->it_sched_expires = p->se.sum_exec_runtime; - } -} - -static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) -{ - update_curr_rt(rq); - - watchdog(rq, p); - - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. 
- */ - if (p->policy != SCHED_RR) - return; - - if (--p->rt.time_slice) - return; - - p->rt.time_slice = DEF_TIMESLICE; - - /* - * Requeue to the end of queue if we are not the only element - * on the queue: - */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); - } -} - -static void set_curr_task_rt(struct rq *rq) -{ - struct task_struct *p = rq->curr; - - p->se.exec_start = rq->clock; -} - -static const struct sched_class rt_sched_class = { - .next = &fair_sched_class, - .enqueue_task = enqueue_task_rt, - .dequeue_task = dequeue_task_rt, - .yield_task = yield_task_rt, -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_rt, -#endif /* CONFIG_SMP */ - - .check_preempt_curr = check_preempt_curr_rt, - - .pick_next_task = pick_next_task_rt, - .put_prev_task = put_prev_task_rt, - -#ifdef CONFIG_SMP - .load_balance = load_balance_rt, - .move_one_task = move_one_task_rt, - .set_cpus_allowed = set_cpus_allowed_rt, - .rq_online = rq_online_rt, - .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, - .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, - .switched_from = switched_from_rt, -#endif - - .set_curr_task = set_curr_task_rt, - .task_tick = task_tick_rt, - - .prio_changed = prio_changed_rt, - .switched_to = switched_to_rt, -}; - -#ifdef CONFIG_SCHED_DEBUG -extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); - -static void print_rt_stats(struct seq_file *m, int cpu) -{ - struct rt_rq *rt_rq; - - rcu_read_lock(); - for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) - print_rt_rq(m, cpu, rt_rq); - rcu_read_unlock(); -} -#endif /* CONFIG_SCHED_DEBUG */ -/* - * linux/kernel/seccomp.c - * - * Copyright 2004-2005 Andrea Arcangeli - * - * This defines a simple but solid secure-computing mode. - */ - -#include -#include -#include - -/* #define SECCOMP_DEBUG 1 */ -#define NR_SECCOMP_MODES 1 - -/* - * Secure computing mode 1 allows only read/write/exit/sigreturn. 
- * To be fully secure this must be combined with rlimit - * to limit the stack allocations too. - */ -static int mode1_syscalls[] = { - __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ -}; - -#ifdef CONFIG_COMPAT -static int mode1_syscalls_32[] = { - __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ -}; -#endif - -void __secure_computing(int this_syscall) -{ - int mode = current->seccomp.mode; - int * syscall; - - switch (mode) { - case 1: - syscall = mode1_syscalls; -#ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall = mode1_syscalls_32; -#endif - do { - if (*syscall == this_syscall) - return; - } while (*++syscall); - break; - default: - BUG(); - } - -#ifdef SECCOMP_DEBUG - dump_stack(); -#endif - do_exit(SIGKILL); -} - -long prctl_get_seccomp(void) -{ - return current->seccomp.mode; -} - -long prctl_set_seccomp(unsigned long seccomp_mode) -{ - long ret; - - /* can set it only once to be even more secure */ - ret = -EPERM; - if (unlikely(current->seccomp.mode)) - goto out; - - ret = -EINVAL; - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); -#ifdef TIF_NOTSC - disable_TSC(); -#endif - ret = 0; - } - - out: - return ret; -} -/* - * Copyright (c) 2008 Intel Corporation - * Author: Matthew Wilcox - * - * Distributed under the terms of the GNU GPL, version 2 - * - * This file implements counting semaphores. - * A counting semaphore may be acquired 'n' times before sleeping. - * See mutex.c for single-acquisition sleeping locks which enforce - * rules which allow code to be debugged more easily. - */ - -/* - * Some notes on the implementation: - * - * The spinlock controls access to the other members of the semaphore. - * down_trylock() and up() can be called from interrupt context, so we - * have to disable interrupts when taking the lock. 
It turns out various - * parts of the kernel expect to be able to use down() on a semaphore in - * interrupt context when they know it will succeed, so we have to use - * irqsave variants for down(), down_interruptible() and down_killable() - * too. - * - * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. - */ - -#include -#include -#include -#include -#include -#include -#include - -static noinline void __down(struct semaphore *sem); -static noinline int __down_interruptible(struct semaphore *sem); -static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); -static noinline void __up(struct semaphore *sem); - -/** - * down - acquire the semaphore - * @sem: the semaphore to be acquired - * - * Acquires the semaphore. If no more tasks are allowed to acquire the - * semaphore, calling this function will put the task to sleep until the - * semaphore is released. - * - * Use of this function is deprecated, please use down_interruptible() or - * down_killable() instead. - */ -void down(struct semaphore *sem) -{ - unsigned long flags; - - spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); -} -EXPORT_SYMBOL(down); - -/** - * down_interruptible - acquire the semaphore unless interrupted - * @sem: the semaphore to be acquired - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the sleep is interrupted by a signal, this function will return -EINTR. - * If the semaphore is successfully acquired, this function returns 0. 
- */ -int down_interruptible(struct semaphore *sem) -{ - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_interruptible(sem); - spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_interruptible); - -/** - * down_killable - acquire the semaphore unless killed - * @sem: the semaphore to be acquired - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the sleep is interrupted by a fatal signal, this function will return - * -EINTR. If the semaphore is successfully acquired, this function returns - * 0. - */ -int down_killable(struct semaphore *sem) -{ - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_killable); - -/** - * down_trylock - try to acquire the semaphore, without waiting - * @sem: the semaphore to be acquired - * - * Try to acquire the semaphore atomically. Returns 0 if the mutex has - * been acquired successfully or 1 if it it cannot be acquired. - * - * NOTE: This return value is inverted from both spin_trylock and - * mutex_trylock! Be careful about this when converting code. - * - * Unlike mutex_trylock, this function can be used from interrupt context, - * and the semaphore can be released by any task or interrupt. 
- */ -int down_trylock(struct semaphore *sem) -{ - unsigned long flags; - int count; - - spin_lock_irqsave(&sem->lock, flags); - count = sem->count - 1; - if (likely(count >= 0)) - sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); - - return (count < 0); -} -EXPORT_SYMBOL(down_trylock); - -/** - * down_timeout - acquire the semaphore within a specified time - * @sem: the semaphore to be acquired - * @jiffies: how long to wait before failing - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the semaphore is not released within the specified number of jiffies, - * this function returns -ETIME. It returns 0 if the semaphore was acquired. - */ -int down_timeout(struct semaphore *sem, long jiffies) -{ - unsigned long flags; - int result = 0; - - spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_timeout(sem, jiffies); - spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_timeout); - -/** - * up - release the semaphore - * @sem: the semaphore to release - * - * Release the semaphore. Unlike mutexes, up() may be called from any - * context and even by tasks which have never called down(). - */ -void up(struct semaphore *sem) -{ - unsigned long flags; - - spin_lock_irqsave(&sem->lock, flags); - if (likely(list_empty(&sem->wait_list))) - sem->count++; - else - __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); -} -EXPORT_SYMBOL(up); - -/* Functions for the contended case */ - -struct semaphore_waiter { - struct list_head list; - struct task_struct *task; - int up; -}; - -/* - * Because this function is inlined, the 'state' parameter will be - * constant, and thus optimised away by the compiler. Likewise the - * 'timeout' parameter for the cases without timeouts. 
- */ -static inline int __sched __down_common(struct semaphore *sem, long state, - long timeout) -{ - struct task_struct *task = current; - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); - waiter.task = task; - waiter.up = 0; - - for (;;) { - if (signal_pending_state(state, task)) - goto interrupted; - if (timeout <= 0) - goto timed_out; - __set_task_state(task, state); - spin_unlock_irq(&sem->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); - if (waiter.up) - return 0; - } - - timed_out: - list_del(&waiter.list); - return -ETIME; - - interrupted: - list_del(&waiter.list); - return -EINTR; -} - -static noinline void __sched __down(struct semaphore *sem) -{ - __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_interruptible(struct semaphore *sem) -{ - return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_killable(struct semaphore *sem) -{ - return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) -{ - return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); -} - -static noinline void __sched __up(struct semaphore *sem) -{ - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); - waiter->up = 1; - wake_up_process(waiter->task); -} -/* - * linux/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson - * - * 2003-06-02 Jim Houston - Concurrent Computer Corp. - * Changes to use preallocated sigqueue structures - * to allow signals to be sent reliably. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include "audit.h" /* audit_signal_info() */ - -/* - * SLAB caches for signal bits. - */ - -static struct kmem_cache *sigqueue_cachep; - -static void __user *sig_handler(struct task_struct *t, int sig) -{ - return t->sighand->action[sig - 1].sa.sa_handler; -} - -static int sig_handler_ignored(void __user *handler, int sig) -{ - /* Is it explicitly or implicitly ignored? */ - return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); -} - -static int sig_ignored(struct task_struct *t, int sig) -{ - void __user *handler; - - /* - * Blocked signals are never ignored, since the - * signal handler may change by the time it is - * unblocked. - */ - if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) - return 0; - - handler = sig_handler(t, sig); - if (!sig_handler_ignored(handler, sig)) - return 0; - - /* - * Tracers may want to know about even ignored signals. - */ - return !tracehook_consider_ignored_signal(t, sig, handler); -} - -/* - * Re-calculate pending state from the set of locally pending - * signals, globally pending signals, and blocked signals. 
- */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) -{ - unsigned long ready; - long i; - - switch (_NSIG_WORDS) { - default: - for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) - ready |= signal->sig[i] &~ blocked->sig[i]; - break; - - case 4: ready = signal->sig[3] &~ blocked->sig[3]; - ready |= signal->sig[2] &~ blocked->sig[2]; - ready |= signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 2: ready = signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 1: ready = signal->sig[0] &~ blocked->sig[0]; - } - return ready != 0; -} - -#define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) - -static int recalc_sigpending_tsk(struct task_struct *t) -{ - if (t->signal->group_stop_count > 0 || - PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { - set_tsk_thread_flag(t, TIF_SIGPENDING); - return 1; - } - /* - * We must never clear the flag in another thread, or in current - * when it's possible the current syscall is returning -ERESTART*. - * So we don't clear it here, and only callers who know they should do. - */ - return 0; -} - -/* - * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. - * This is superfluous when called on current, the wakeup is a harmless no-op. - */ -void recalc_sigpending_and_wake(struct task_struct *t) -{ - if (recalc_sigpending_tsk(t)) - signal_wake_up(t, 0); -} - -void recalc_sigpending(void) -{ - if (unlikely(tracehook_force_sigpending())) - set_thread_flag(TIF_SIGPENDING); - else if (!recalc_sigpending_tsk(current) && !freezing(current)) - clear_thread_flag(TIF_SIGPENDING); - -} - -/* Given the mask, find the first available signal that should be serviced. 
*/ - -int next_signal(struct sigpending *pending, sigset_t *mask) -{ - unsigned long i, *s, *m, x; - int sig = 0; - - s = pending->signal.sig; - m = mask->sig; - switch (_NSIG_WORDS) { - default: - for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) - if ((x = *s &~ *m) != 0) { - sig = ffz(~x) + i*_NSIG_BPW + 1; - break; - } - break; - - case 2: if ((x = s[0] &~ m[0]) != 0) - sig = 1; - else if ((x = s[1] &~ m[1]) != 0) - sig = _NSIG_BPW + 1; - else - break; - sig += ffz(~x); - break; - - case 1: if ((x = *s &~ *m) != 0) - sig = ffz(~x) + 1; - break; - } - - return sig; -} - -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - int override_rlimit) -{ - struct sigqueue *q = NULL; - struct user_struct *user; - - /* - * In order to avoid problems with "switch_user()", we want to make - * sure that the compiler doesn't re-load "t->user" - */ - user = t->user; - barrier(); - atomic_inc(&user->sigpending); - if (override_rlimit || - atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) - q = kmem_cache_alloc(sigqueue_cachep, flags); - if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - } else { - INIT_LIST_HEAD(&q->list); - q->flags = 0; - q->user = get_uid(user); - } - return(q); -} - -static void __sigqueue_free(struct sigqueue *q) -{ - if (q->flags & SIGQUEUE_PREALLOC) - return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); - kmem_cache_free(sigqueue_cachep, q); -} - -void flush_sigqueue(struct sigpending *queue) -{ - struct sigqueue *q; - - sigemptyset(&queue->signal); - while (!list_empty(&queue->list)) { - q = list_entry(queue->list.next, struct sigqueue , list); - list_del_init(&q->list); - __sigqueue_free(q); - } -} - -/* - * Flush all pending signals for a task. 
- */ -void flush_signals(struct task_struct *t) -{ - unsigned long flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); - spin_unlock_irqrestore(&t->sighand->siglock, flags); -} - -static void __flush_itimer_signals(struct sigpending *pending) -{ - sigset_t signal, retain; - struct sigqueue *q, *n; - - signal = pending->signal; - sigemptyset(&retain); - - list_for_each_entry_safe(q, n, &pending->list, list) { - int sig = q->info.si_signo; - - if (likely(q->info.si_code != SI_TIMER)) { - sigaddset(&retain, sig); - } else { - sigdelset(&signal, sig); - list_del_init(&q->list); - __sigqueue_free(q); - } - } - - sigorsets(&pending->signal, &signal, &retain); -} - -void flush_itimer_signals(void) -{ - struct task_struct *tsk = current; - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - __flush_itimer_signals(&tsk->pending); - __flush_itimer_signals(&tsk->signal->shared_pending); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); -} - -void ignore_signals(struct task_struct *t) -{ - int i; - - for (i = 0; i < _NSIG; ++i) - t->sighand->action[i].sa.sa_handler = SIG_IGN; - - flush_signals(t); -} - -/* - * Flush all handlers for a task. 
- */ - -void -flush_signal_handlers(struct task_struct *t, int force_default) -{ - int i; - struct k_sigaction *ka = &t->sighand->action[0]; - for (i = _NSIG ; i != 0 ; i--) { - if (force_default || ka->sa.sa_handler != SIG_IGN) - ka->sa.sa_handler = SIG_DFL; - ka->sa.sa_flags = 0; - sigemptyset(&ka->sa.sa_mask); - ka++; - } -} - -int unhandled_signal(struct task_struct *tsk, int sig) -{ - void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; - if (is_global_init(tsk)) - return 1; - if (handler != SIG_IGN && handler != SIG_DFL) - return 0; - return !tracehook_consider_fatal_signal(tsk, sig, handler); -} - - -/* Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. */ - -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) -{ - struct sigqueue *q, *first = NULL; - - /* - * Collect the siginfo appropriate to this signal. Check if - * there is another siginfo for the same signal. 
- */ - list_for_each_entry(q, &list->list, list) { - if (q->info.si_signo == sig) { - if (first) - goto still_pending; - first = q; - } - } - - sigdelset(&list->signal, sig); - - if (first) { -still_pending: - list_del_init(&first->list); - copy_siginfo(info, &first->info); - __sigqueue_free(first); - } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. - */ - info->si_signo = sig; - info->si_errno = 0; - info->si_code = 0; - info->si_pid = 0; - info->si_uid = 0; - } -} - -static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) -{ - int sig = next_signal(pending, mask); - - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - - collect_signal(sig, pending, info); - } - - return sig; -} - -/* - * Dequeue a signal and return the element to the caller, which is - * expected to free it. - * - * All callers have to hold the siglock. - */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) -{ - int signr; - - /* We only dequeue private signals from ourselves, we don't let - * signalfd steal them - */ - signr = __dequeue_signal(&tsk->pending, mask, info); - if (!signr) { - signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); - /* - * itimer signal ? - * - * itimers are process shared and we restart periodic - * itimers in the signal delivery path to prevent DoS - * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting - * itimers, as the SIGALRM is a legacy signal and only - * queued once. Changing the restart behaviour to - * restart the timer in the signal dequeue path is - * reducing the timer noise on heavy loaded !highres - * systems too. 
- */ - if (unlikely(signr == SIGALRM)) { - struct hrtimer *tmr = &tsk->signal->real_timer; - - if (!hrtimer_is_queued(tmr) && - tsk->signal->it_real_incr.tv64 != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); - hrtimer_restart(tmr); - } - } - } - - recalc_sigpending(); - if (!signr) - return 0; - - if (unlikely(sig_kernel_stop(signr))) { - /* - * Set a marker that we have dequeued a stop signal. Our - * caller might release the siglock and then the pending - * stop signal it is about to process is no longer in the - * pending bitmasks, but must still be cleared by a SIGCONT - * (and overruled by a SIGKILL). So those cases clear this - * shared flag after we've set it. Note that this flag may - * remain set after the signal we return is ignored or - * handled. That doesn't matter because its only purpose - * is to alert stop-signal processing code when another - * processor has come along and cleared the flag. - */ - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; - } - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { - /* - * Release the siglock to ensure proper locking order - * of timer locks outside of siglocks. Note, we leave - * irqs disabled here, since the posix-timers code is - * about to disable them again anyway. - */ - spin_unlock(&tsk->sighand->siglock); - do_schedule_next_timer(info); - spin_lock(&tsk->sighand->siglock); - } - return signr; -} - -/* - * Tell a process that it has a new active signal.. - * - * NOTE! we rely on the previous spin_lock to - * lock interrupts for us! We can only be called with - * "siglock" held, and the local interrupt must - * have been disabled when that got acquired! - * - * No need to set need_resched since signal event passing - * goes through ->blocked - */ -void signal_wake_up(struct task_struct *t, int resume) -{ - unsigned int mask; - - set_tsk_thread_flag(t, TIF_SIGPENDING); - - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable - * case. 
We don't check t->state here because there is a race with it - * executing another processor and just now entering stopped state. - * By using wake_up_state, we ensure the process will wake up and - * handle its death signal. - */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) - kick_process(t); -} - -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word. - */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - sigset_t m; - - sigandsets(&m, mask, &s->signal); - if (sigisemptyset(&m)) - return 0; - - signandsets(&s->signal, &s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (sigismember(mask, q->info.si_signo)) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. 
- */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - - if (!sigtestsetmask(&s->signal, mask)) - return 0; - - sigdelsetmask(&s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (q->info.si_signo < SIGRTMIN && - (mask & sigmask(q->info.si_signo))) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} - -/* - * Bad permissions for sending the signal - */ -static int check_kill_permission(int sig, struct siginfo *info, - struct task_struct *t) -{ - struct pid *sid; - int error; - - if (!valid_signal(sig)) - return -EINVAL; - - if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) - return 0; - - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - - if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && - (current->uid ^ t->suid) && (current->uid ^ t->uid) && - !capable(CAP_KILL)) { - switch (sig) { - case SIGCONT: - sid = task_session(t); - /* - * We don't return the error if sid == NULL. The - * task was unhashed, the caller must notice this. - */ - if (!sid || sid == task_session(current)) - break; - default: - return -EPERM; - } - } - - return security_task_kill(t, info, sig, 0); -} - -/* - * Handle magic process-wide effects of stop/continue signals. Unlike - * the signal actions, these happen immediately at signal-generation - * time regardless of blocking, ignoring, or handling. This does the - * actual continuing for SIGCONT, but not the actual stopping for stop - * signals. The process stop is done as a signal action for SIG_DFL. - * - * Returns true if the signal should be actually delivered, otherwise - * it should be dropped. - */ -static int prepare_signal(int sig, struct task_struct *p) -{ - struct signal_struct *signal = p->signal; - struct task_struct *t; - - if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { - /* - * The process is in the middle of dying, nothing to do. 
- */ - } else if (sig_kernel_stop(sig)) { - /* - * This is a stop signal. Remove SIGCONT from all queues. - */ - rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); - t = p; - do { - rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); - } else if (sig == SIGCONT) { - unsigned int why; - /* - * Remove all stop signals from all queues, - * and wake all threads. - */ - rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); - t = p; - do { - unsigned int state; - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - /* - * If there is a handler for SIGCONT, we must make - * sure that no thread returns to user mode before - * we post the signal, in case it was the only - * thread eligible to run the signal handler--then - * it must not do anything between resuming and - * running the handler. With the TIF_SIGPENDING - * flag set, the thread will pause and acquire the - * siglock that we hold now and until we've queued - * the pending signal. - * - * Wake up the stopped thread _after_ setting - * TIF_SIGPENDING - */ - state = __TASK_STOPPED; - if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { - set_tsk_thread_flag(t, TIF_SIGPENDING); - state |= TASK_INTERRUPTIBLE; - } - wake_up_state(t, state); - } while_each_thread(p, t); - - /* - * Notify the parent with CLD_CONTINUED if we were stopped. - * - * If we were in the middle of a group stop, we pretend it - * was already finished, and then continued. Since SIGCHLD - * doesn't queue we report only CLD_STOPPED, as if the next - * CLD_CONTINUED was dropped. - */ - why = 0; - if (signal->flags & SIGNAL_STOP_STOPPED) - why |= SIGNAL_CLD_CONTINUED; - else if (signal->group_stop_count) - why |= SIGNAL_CLD_STOPPED; - - if (why) { - /* - * The first thread which returns from finish_stop() - * will take ->siglock, notice SIGNAL_CLD_MASK, and - * notify its parent. See get_signal_to_deliver(). 
- */ - signal->flags = why | SIGNAL_STOP_CONTINUED; - signal->group_stop_count = 0; - signal->group_exit_code = 0; - } else { - /* - * We are not stopped, but there could be a stop - * signal in the middle of being processed after - * being removed from the queue. Clear that too. - */ - signal->flags &= ~SIGNAL_STOP_DEQUEUED; - } - } - - return !sig_ignored(p, sig); -} - -/* - * Test if P wants to take SIG. After we've checked all threads with this, - * it's equivalent to finding no threads not blocking SIG. Any threads not - * blocking SIG were ruled out because they are not running and already - * have pending signals. Such threads will dequeue from the shared queue - * as soon as they're available, so putting the signal on the shared queue - * will be equivalent to sending it to one such thread. - */ -static inline int wants_signal(int sig, struct task_struct *p) -{ - if (sigismember(&p->blocked, sig)) - return 0; - if (p->flags & PF_EXITING) - return 0; - if (sig == SIGKILL) - return 1; - if (task_is_stopped_or_traced(p)) - return 0; - return task_curr(p) || !signal_pending(p); -} - -static void complete_signal(int sig, struct task_struct *p, int group) -{ - struct signal_struct *signal = p->signal; - struct task_struct *t; - - /* - * Now find a thread we can wake up to take the signal off the queue. - * - * If the main thread wants the signal, it gets first crack. - * Probably the least surprising to the average bear. - */ - if (wants_signal(sig, p)) - t = p; - else if (!group || thread_group_empty(p)) - /* - * There is just one thread and it does not need to be woken. - * It will dequeue unblocked signals before it runs again. - */ - return; - else { - /* - * Otherwise try to find a suitable thread. - */ - t = signal->curr_target; - while (!wants_signal(sig, t)) { - t = next_thread(t); - if (t == signal->curr_target) - /* - * No thread needs to be woken. - * Any eligible threads will see - * the signal in the queue soon. 
- */ - return; - } - signal->curr_target = t; - } - - /* - * Found a killable thread. If the signal will be fatal, - * then start taking the whole group down immediately. - */ - if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && - !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { - /* - * This signal will be fatal to the whole group. - */ - if (!sig_kernel_coredump(sig)) { - /* - * Start a group exit and wake everybody up. - * This way we don't have other threads - * running and doing things after a slower - * thread has the fatal signal pending. - */ - signal->flags = SIGNAL_GROUP_EXIT; - signal->group_exit_code = sig; - signal->group_stop_count = 0; - t = p; - do { - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); - } while_each_thread(p, t); - return; - } - } - - /* - * The signal is already in the shared-pending queue. - * Tell the chosen thread to wake up and dequeue it. - */ - signal_wake_up(t, sig == SIGKILL); - return; -} - -static inline int legacy_queue(struct sigpending *signals, int sig) -{ - return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); -} - -static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) -{ - struct sigpending *pending; - struct sigqueue *q; - - assert_spin_locked(&t->sighand->siglock); - if (!prepare_signal(sig, t)) - return 0; - - pending = group ? &t->signal->shared_pending : &t->pending; - /* - * Short-circuit ignored signals and support queuing - * exactly one non-rt signal, so that we can get more - * detailed information about the cause of the signal. - */ - if (legacy_queue(pending, sig)) - return 0; - /* - * fast-pathed signals for kernel-internal things like SIGSTOP - * or SIGKILL. - */ - if (info == SEND_SIG_FORCED) - goto out_set; - - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. 
It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - - q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - (is_si_special(info) || - info->si_code >= 0))); - if (q) { - list_add_tail(&q->list, &pending->list); - switch ((unsigned long) info) { - case (unsigned long) SEND_SIG_NOINFO: - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_USER; - q->info.si_pid = task_pid_vnr(current); - q->info.si_uid = current->uid; - break; - case (unsigned long) SEND_SIG_PRIV: - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_KERNEL; - q->info.si_pid = 0; - q->info.si_uid = 0; - break; - default: - copy_siginfo(&q->info, info); - break; - } - } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) - /* - * Queue overflow, abort. We may abort if the signal was rt - * and sent by user using something other than kill(). 
- */ - return -EAGAIN; - } - -out_set: - signalfd_notify(t, sig); - sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); - return 0; -} - -int print_fatal_signals; - -static void print_fatal_signal(struct pt_regs *regs, int signr) -{ - printk("%s/%d: potentially unexpected fatal signal %d.\n", - current->comm, task_pid_nr(current), signr); - -#if defined(__i386__) && !defined(__arch_um__) - printk("code at %08lx: ", regs->ip); - { - int i; - for (i = 0; i < 16; i++) { - unsigned char insn; - - if (get_user(insn, (unsigned char *)(regs->ip + i))) - break; - printk("%02x ", insn); - } - } -#endif - printk("\n"); - show_regs(regs); -} - -static int __init setup_print_fatal_signals(char *str) -{ - get_option (&str, &print_fatal_signals); - - return 1; -} - -__setup("print-fatal-signals=", setup_print_fatal_signals); - -int -__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - return send_signal(sig, info, p, 1); -} - -static int -specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) -{ - return send_signal(sig, info, t, 0); -} - -/* - * Force a signal that the process can't ignore: if necessary - * we unblock the signal and change any SIG_IGN to SIG_DFL. - * - * Note: If we unblock the signal, we always reset it to SIG_DFL, - * since we do not want to have a signal handler that was blocked - * be invoked when user space had explicitly blocked it. - * - * We don't want to have recursive SIGSEGV's etc, for example, - * that is why we also clear SIGNAL_UNKILLABLE. 
- */ -int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) -{ - unsigned long int flags; - int ret, blocked, ignored; - struct k_sigaction *action; - - spin_lock_irqsave(&t->sighand->siglock, flags); - action = &t->sighand->action[sig-1]; - ignored = action->sa.sa_handler == SIG_IGN; - blocked = sigismember(&t->blocked, sig); - if (blocked || ignored) { - action->sa.sa_handler = SIG_DFL; - if (blocked) { - sigdelset(&t->blocked, sig); - recalc_sigpending_and_wake(t); - } - } - if (action->sa.sa_handler == SIG_DFL) - t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = specific_send_sig_info(sig, info, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); - - return ret; -} - -void -force_sig_specific(int sig, struct task_struct *t) -{ - force_sig_info(sig, SEND_SIG_FORCED, t); -} - -/* - * Nuke all other threads in the group. - */ -void zap_other_threads(struct task_struct *p) -{ - struct task_struct *t; - - p->signal->group_stop_count = 0; - - for (t = next_thread(p); t != p; t = next_thread(t)) { - /* - * Don't bother with already dead threads - */ - if (t->exit_state) - continue; - - /* SIGKILL will be handled before any pending SIGSTOP */ - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); - } -} - -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - -struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) -{ - struct sighand_struct *sighand; - - rcu_read_lock(); - for (;;) { - sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) - break; - - spin_lock_irqsave(&sighand->siglock, *flags); - if (likely(sighand == tsk->sighand)) - break; - spin_unlock_irqrestore(&sighand->siglock, *flags); - } - rcu_read_unlock(); - - return sighand; -} - -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - unsigned long flags; - int ret; - - ret = 
check_kill_permission(sig, info, p); - - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } - - return ret; -} - -/* - * __kill_pgrp_info() sends a signal to a process group: this is what the tty - * control characters do (^C, ^Z etc) - */ - -int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) -{ - struct task_struct *p = NULL; - int retval, success; - - success = 0; - retval = -ESRCH; - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); - success |= !err; - retval = err; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return success ? 0 : retval; -} - -int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) -{ - int error = -ESRCH; - struct task_struct *p; - - rcu_read_lock(); -retry: - p = pid_task(pid, PIDTYPE_PID); - if (p) { - error = group_send_sig_info(sig, info, p); - if (unlikely(error == -ESRCH)) - /* - * The task was unhashed in between, try again. - * If it is dead, pid_task() will return NULL, - * if we race with de_thread() it will find the - * new leader. 
- */ - goto retry; - } - rcu_read_unlock(); - - return error; -} - -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) -{ - int error; - rcu_read_lock(); - error = kill_pid_info(sig, info, find_vpid(pid)); - rcu_read_unlock(); - return error; -} - -/* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, - uid_t uid, uid_t euid, u32 secid) -{ - int ret = -EINVAL; - struct task_struct *p; - - if (!valid_signal(sig)) - return ret; - - read_lock(&tasklist_lock); - p = pid_task(pid, PIDTYPE_PID); - if (!p) { - ret = -ESRCH; - goto out_unlock; - } - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && (euid != p->suid) && (euid != p->uid) - && (uid != p->suid) && (uid != p->uid)) { - ret = -EPERM; - goto out_unlock; - } - ret = security_task_kill(p, info, sig, secid); - if (ret) - goto out_unlock; - if (sig && p->sighand) { - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -out_unlock: - read_unlock(&tasklist_lock); - return ret; -} -EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); - -/* - * kill_something_info() interprets pid in interesting ways just like kill(2). - * - * POSIX specifies that kill(-1,sig) is unspecified, but what we have - * is probably wrong. Should make it like BSD or SYSV. - */ - -static int kill_something_info(int sig, struct siginfo *info, pid_t pid) -{ - int ret; - - if (pid > 0) { - rcu_read_lock(); - ret = kill_pid_info(sig, info, find_vpid(pid)); - rcu_read_unlock(); - return ret; - } - - read_lock(&tasklist_lock); - if (pid != -1) { - ret = __kill_pgrp_info(sig, info, - pid ? 
find_vpid(-pid) : task_pgrp(current)); - } else { - int retval = 0, count = 0; - struct task_struct * p; - - for_each_process(p) { - if (task_pid_vnr(p) > 1 && - !same_thread_group(p, current)) { - int err = group_send_sig_info(sig, info, p); - ++count; - if (err != -EPERM) - retval = err; - } - } - ret = count ? retval : -ESRCH; - } - read_unlock(&tasklist_lock); - - return ret; -} - -/* - * These are for backward compatibility with the rest of the kernel source. - */ - -/* - * The caller must ensure the task can't exit. - */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - int ret; - unsigned long flags; - - /* - * Make sure legacy kernel users don't send in bad values - * (normal paths check this in check_kill_permission). - */ - if (!valid_signal(sig)) - return -EINVAL; - - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - -#define __si_special(priv) \ - ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) - -int -send_sig(int sig, struct task_struct *p, int priv) -{ - return send_sig_info(sig, __si_special(priv), p); -} - -void -force_sig(int sig, struct task_struct *p) -{ - force_sig_info(sig, SEND_SIG_PRIV, p); -} - -/* - * When things go south during signal handling, we - * will force a SIGSEGV. And if the signal that caused - * the problem was already a SIGSEGV, we'll want to - * make sure we don't even try to deliver the signal.. 
- */ -int -force_sigsegv(int sig, struct task_struct *p) -{ - if (sig == SIGSEGV) { - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } - force_sig(SIGSEGV, p); - return 0; -} - -int kill_pgrp(struct pid *pid, int sig, int priv) -{ - int ret; - - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); - read_unlock(&tasklist_lock); - - return ret; -} -EXPORT_SYMBOL(kill_pgrp); - -int kill_pid(struct pid *pid, int sig, int priv) -{ - return kill_pid_info(sig, __si_special(priv), pid); -} -EXPORT_SYMBOL(kill_pid); - -/* - * These functions support sending signals using preallocated sigqueue - * structures. This is needed "because realtime applications cannot - * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers - * we allocate the sigqueue structure from the timer_create. If this - * allocation fails we are able to report the failure to the application - * with an EAGAIN error. - */ - -struct sigqueue *sigqueue_alloc(void) -{ - struct sigqueue *q; - - if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) - q->flags |= SIGQUEUE_PREALLOC; - return(q); -} - -void sigqueue_free(struct sigqueue *q) -{ - unsigned long flags; - spinlock_t *lock = ¤t->sighand->siglock; - - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - /* - * We must hold ->siglock while testing q->list - * to serialize with collect_signal() or with - * __exit_signal()->flush_sigqueue(). - */ - spin_lock_irqsave(lock, flags); - q->flags &= ~SIGQUEUE_PREALLOC; - /* - * If it is queued it will be freed when dequeued, - * like the "regular" sigqueue. 
- */ - if (!list_empty(&q->list)) - q = NULL; - spin_unlock_irqrestore(lock, flags); - - if (q) - __sigqueue_free(q); -} - -int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) -{ - int sig = q->info.si_signo; - struct sigpending *pending; - unsigned long flags; - int ret; - - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - - ret = -1; - if (!likely(lock_task_sighand(t, &flags))) - goto ret; - - ret = 1; /* the signal is ignored */ - if (!prepare_signal(sig, t)) - goto out; - - ret = 0; - if (unlikely(!list_empty(&q->list))) { - /* - * If an SI_TIMER entry is already queue just increment - * the overrun count. - */ - BUG_ON(q->info.si_code != SI_TIMER); - q->info.si_overrun++; - goto out; - } - q->info.si_overrun = 0; - - signalfd_notify(t, sig); - pending = group ? &t->signal->shared_pending : &t->pending; - list_add_tail(&q->list, &pending->list); - sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); -out: - unlock_task_sighand(t, &flags); -ret: - return ret; -} - -/* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* - * Let a parent know about the death of a child. - * For a stopped/continued status change, use do_notify_parent_cldstop instead. - * - * Returns -1 if our parent ignored us and so we've switched to - * self-reaping, or else @sig. - */ -int do_notify_parent(struct task_struct *tsk, int sig) -{ - struct siginfo info; - unsigned long flags; - struct sighand_struct *psig; - int ret = sig; - - BUG_ON(sig == -1); - - /* do_notify_parent_cldstop should have been called instead. 
*/ - BUG_ON(task_is_stopped_or_traced(tsk)); - - BUG_ON(!tsk->ptrace && - (tsk->group_leader != tsk || !thread_group_empty(tsk))); - - info.si_signo = sig; - info.si_errno = 0; - /* - * we are under tasklist_lock here so our parent is tied to - * us and cannot exit and release its namespace. - * - * the only it can is to switch its nsproxy with sys_unshare, - * bu uncharing pid namespaces is not allowed, so we'll always - * see relevant namespace - * - * write_lock() currently calls preempt_disable() which is the - * same as rcu_read_lock(), but according to Oleg, this is not - * correct to rely on this - */ - rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - rcu_read_unlock(); - - info.si_uid = tsk->uid; - - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); - - info.si_status = tsk->exit_code & 0x7f; - if (tsk->exit_code & 0x80) - info.si_code = CLD_DUMPED; - else if (tsk->exit_code & 0x7f) - info.si_code = CLD_KILLED; - else { - info.si_code = CLD_EXITED; - info.si_status = tsk->exit_code >> 8; - } - - psig = tsk->parent->sighand; - spin_lock_irqsave(&psig->siglock, flags); - if (!tsk->ptrace && sig == SIGCHLD && - (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || - (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { - /* - * We are exiting and our parent doesn't care. POSIX.1 - * defines special semantics for setting SIGCHLD to SIG_IGN - * or setting the SA_NOCLDWAIT flag: we should be reaped - * automatically and not left for our parent's wait4 call. - * Rather than having the parent do it as a magic kind of - * signal handler, we just set this to tell do_exit that we - * can be cleaned up without becoming a zombie. Note that - * we still call __wake_up_parent in this case, because a - * blocked sys_wait4 might now return -ECHILD. 
- * - * Whether we send SIGCHLD or not for SA_NOCLDWAIT - * is implementation-defined: we do (if you don't want - * it, just use SIG_IGN instead). - */ - ret = tsk->exit_signal = -1; - if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = -1; - } - if (valid_signal(sig) && sig > 0) - __group_send_sig_info(sig, &info, tsk->parent); - __wake_up_parent(tsk, tsk->parent); - spin_unlock_irqrestore(&psig->siglock, flags); - - return ret; -} - -static void do_notify_parent_cldstop(struct task_struct *tsk, int why) -{ - struct siginfo info; - unsigned long flags; - struct task_struct *parent; - struct sighand_struct *sighand; - - if (tsk->ptrace & PT_PTRACED) - parent = tsk->parent; - else { - tsk = tsk->group_leader; - parent = tsk->real_parent; - } - - info.si_signo = SIGCHLD; - info.si_errno = 0; - /* - * see comment in do_notify_parent() abot the following 3 lines - */ - rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - rcu_read_unlock(); - - info.si_uid = tsk->uid; - - info.si_utime = cputime_to_clock_t(tsk->utime); - info.si_stime = cputime_to_clock_t(tsk->stime); - - info.si_code = why; - switch (why) { - case CLD_CONTINUED: - info.si_status = SIGCONT; - break; - case CLD_STOPPED: - info.si_status = tsk->signal->group_exit_code & 0x7f; - break; - case CLD_TRAPPED: - info.si_status = tsk->exit_code & 0x7f; - break; - default: - BUG(); - } - - sighand = parent->sighand; - spin_lock_irqsave(&sighand->siglock, flags); - if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && - !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - __group_send_sig_info(SIGCHLD, &info, parent); - /* - * Even if SIGCHLD is not generated, we must wake up wait4 calls. - */ - __wake_up_parent(tsk, parent); - spin_unlock_irqrestore(&sighand->siglock, flags); -} - -static inline int may_ptrace_stop(void) -{ - if (!likely(current->ptrace & PT_PTRACED)) - return 0; - /* - * Are we in the middle of do_coredump? 
- * If so and our tracer is also part of the coredump stopping - * is a deadlock situation, and pointless because our tracer - * is dead so don't allow us to stop. - * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_state != NULL. Otherwise it - * is safe to enter schedule(). - */ - if (unlikely(current->mm->core_state) && - unlikely(current->mm == current->parent->mm)) - return 0; - - return 1; -} - -/* - * Return nonzero if there is a SIGKILL that should be waking us up. - * Called with the siglock held. - */ -static int sigkill_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); -} - -/* - * This must be called with current->sighand->siglock held. - * - * This should be the path for all ptrace stops. - * We always set current->last_siginfo while stopped here. - * That makes it a way to test a stopped process for - * being ptrace-stopped vs being job-control-stopped. - * - * If we actually decide not to stop at all because the tracer - * is gone, we keep current->exit_code unless clear_code. - */ -static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) -{ - if (arch_ptrace_stop_needed(exit_code, info)) { - /* - * The arch code has something special to do before a - * ptrace stop. This is allowed to block, e.g. for faults - * on user stack pages. We can't keep the siglock while - * calling arch_ptrace_stop, so we must release it now. - * To preserve proper semantics, we must do this before - * any signal bookkeeping like checking group_stop_count. - * Meanwhile, a SIGKILL could come in before we retake the - * siglock. That must prevent us from sleeping in TASK_TRACED. - * So after regaining the lock, we must check for SIGKILL. 
- */ - spin_unlock_irq(¤t->sighand->siglock); - arch_ptrace_stop(exit_code, info); - spin_lock_irq(¤t->sighand->siglock); - if (sigkill_pending(current)) - return; - } - - /* - * If there is a group stop in progress, - * we must participate in the bookkeeping. - */ - if (current->signal->group_stop_count > 0) - --current->signal->group_stop_count; - - current->last_siginfo = info; - current->exit_code = exit_code; - - /* Let the debugger run. */ - __set_current_state(TASK_TRACED); - spin_unlock_irq(¤t->sighand->siglock); - read_lock(&tasklist_lock); - if (may_ptrace_stop()) { - do_notify_parent_cldstop(current, CLD_TRAPPED); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); - read_unlock(&tasklist_lock); - preempt_enable_no_resched(); - schedule(); - } else { - /* - * By the time we got the lock, our tracer went away. - * Don't drop the lock yet, another tracer may come. - */ - __set_current_state(TASK_RUNNING); - if (clear_code) - current->exit_code = 0; - read_unlock(&tasklist_lock); - } - - /* - * While in TASK_TRACED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - - /* - * We are back. Now reacquire the siglock before touching - * last_siginfo, so that we are sure to have synchronized with - * any signal-sending on another CPU that wants to examine it. - */ - spin_lock_irq(¤t->sighand->siglock); - current->last_siginfo = NULL; - - /* - * Queued signals ignored us while we were stopped for tracing. - * So check for any that we should take before resuming user mode. - * This sets TIF_SIGPENDING, but never clears it. 
- */ - recalc_sigpending_tsk(current); -} - -void ptrace_notify(int exit_code) -{ - siginfo_t info; - - BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - - memset(&info, 0, sizeof info); - info.si_signo = SIGTRAP; - info.si_code = exit_code; - info.si_pid = task_pid_vnr(current); - info.si_uid = current->uid; - - /* Let the debugger run. */ - spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, 1, &info); - spin_unlock_irq(¤t->sighand->siglock); -} - -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - -/* - * This performs the stopping for SIGSTOP and other stop signals. - * We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. - * Returns zero if we didn't stop and still hold the siglock. - */ -static int do_signal_stop(int signr) -{ - struct signal_struct *sig = current->signal; - int stop_count; - - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { - struct task_struct *t; - - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || - unlikely(signal_group_exit(sig))) - return 0; - /* - * There is no group stop already in progress. - * We must initiate one now. 
- */ - sig->group_exit_code = signr; - - stop_count = 0; - for (t = next_thread(current); t != current; t = next_thread(t)) - /* - * Setting state to TASK_STOPPED for a group - * stop is always done with the siglock held, - * so this check has no races. - */ - if (!(t->flags & PF_EXITING) && - !task_is_stopped_or_traced(t)) { - stop_count++; - signal_wake_up(t, 0); - } - sig->group_stop_count = stop_count; - } - - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); - - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); - return 1; -} - -static int ptrace_signal(int signr, siginfo_t *info, - struct pt_regs *regs, void *cookie) -{ - if (!(current->ptrace & PT_PTRACED)) - return signr; - - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ - ptrace_stop(signr, 0, info); - - /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; - if (signr == 0) - return signr; - - current->exit_code = 0; - - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ - if (signr != info->si_signo) { - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; - info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; - } - - /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); - signr = 0; - } - - return signr; -} - -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) -{ - struct sighand_struct *sighand = current->sighand; - struct signal_struct *signal = current->signal; - int signr; - -relock: - /* - * We'll jump back here after any time we were stopped in TASK_STOPPED. 
- * While in TASK_STOPPED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - - spin_lock_irq(&sighand->siglock); - /* - * Every stopped thread goes here after wakeup. Check to see if - * we should notify the parent, prepare_signal(SIGCONT) encodes - * the CLD_ si_code into SIGNAL_CLD_MASK bits. - */ - if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - int why = (signal->flags & SIGNAL_STOP_CONTINUED) - ? CLD_CONTINUED : CLD_STOPPED; - signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; - - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); - goto relock; - } - - for (;;) { - struct k_sigaction *ka; - - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) - goto relock; - - /* - * Tracing can induce an artifical signal and choose sigaction. - * The return value in @signr determines the default action, - * but @info->si_signo is the signal number we will report. - */ - signr = tracehook_get_signal(current, regs, info, return_ka); - if (unlikely(signr < 0)) - goto relock; - if (unlikely(signr != 0)) - ka = return_ka; - else { - signr = dequeue_signal(current, ¤t->blocked, - info); - - if (!signr) - break; /* will return 0 */ - - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); - if (!signr) - continue; - } - - ka = &sighand->action[signr-1]; - } - - if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ - continue; - if (ka->sa.sa_handler != SIG_DFL) { - /* Run the handler. */ - *return_ka = *ka; - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; - - break; /* will return non-zero "signr" value */ - } - - /* - * Now we are doing the default action for this signal. - */ - if (sig_kernel_ignore(signr)) /* Default is nothing. 
*/ - continue; - - /* - * Global init gets no signals it doesn't want. - */ - if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && - !signal_group_exit(signal)) - continue; - - if (sig_kernel_stop(signr)) { - /* - * The default action is to stop all threads in - * the thread group. The job control signals - * do nothing in an orphaned pgrp, but SIGSTOP - * always works. Note that siglock needs to be - * dropped during the call to is_orphaned_pgrp() - * because of lock ordering with tasklist_lock. - * This allows an intervening SIGCONT to be posted. - * We need to check for that and bail out if necessary. - */ - if (signr != SIGSTOP) { - spin_unlock_irq(&sighand->siglock); - - /* signals can be posted during this window */ - - if (is_current_pgrp_orphaned()) - goto relock; - - spin_lock_irq(&sighand->siglock); - } - - if (likely(do_signal_stop(info->si_signo))) { - /* It released the siglock. */ - goto relock; - } - - /* - * We didn't actually stop, due to a race - * with SIGCONT or something like that. - */ - continue; - } - - spin_unlock_irq(&sighand->siglock); - - /* - * Anything else is fatal, maybe with a core dump. - */ - current->flags |= PF_SIGNALED; - - if (sig_kernel_coredump(signr)) { - if (print_fatal_signals) - print_fatal_signal(regs, info->si_signo); - /* - * If it was able to dump core, this kills all - * other threads in the group and synchronizes with - * their demise. If we lost the race with another - * thread getting here, it set group_exit_code - * first and our do_group_exit call below will use - * that value and ignore the one we pass it. - */ - do_coredump(info->si_signo, info->si_signo, regs); - } - - /* - * Death signals, no core dump. 
- */ - do_group_exit(info->si_signo); - /* NOTREACHED */ - } - spin_unlock_irq(&sighand->siglock); - return signr; -} - -void exit_signals(struct task_struct *tsk) -{ - int group_stop = 0; - struct task_struct *t; - - if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { - tsk->flags |= PF_EXITING; - return; - } - - spin_lock_irq(&tsk->sighand->siglock); - /* - * From now this task is not visible for group-wide signals, - * see wants_signal(), do_signal_stop(). - */ - tsk->flags |= PF_EXITING; - if (!signal_pending(tsk)) - goto out; - - /* It could be that __group_complete_signal() choose us to - * notify about group-wide signal. Another thread should be - * woken now to take the signal since we will not. - */ - for (t = tsk; (t = next_thread(t)) != tsk; ) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); - - if (unlikely(tsk->signal->group_stop_count) && - !--tsk->signal->group_stop_count) { - tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; - } -out: - spin_unlock_irq(&tsk->sighand->siglock); - - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); - read_unlock(&tasklist_lock); - } -} - -EXPORT_SYMBOL(recalc_sigpending); -EXPORT_SYMBOL_GPL(dequeue_signal); -EXPORT_SYMBOL(flush_signals); -EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(send_sig); -EXPORT_SYMBOL(send_sig_info); -EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - - -/* - * System call entry points. - */ - -SYSCALL_DEFINE0(restart_syscall) -{ - struct restart_block *restart = ¤t_thread_info()->restart_block; - return restart->fn(restart); -} - -long do_no_restart_syscall(struct restart_block *param) -{ - return -EINTR; -} - -/* - * We don't need to get the kernel lock - this is all local to this - * particular thread.. 
(and that's good, because this is _heavily_ - * used by various programs) - */ - -/* - * This is also useful for kernel threads that want to temporarily - * (or permanently) block certain signals. - * - * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel - * interface happily blocks "unblockable" signals like SIGKILL - * and friends. - */ -int sigprocmask(int how, sigset_t *set, sigset_t *oldset) -{ - int error; - - spin_lock_irq(¤t->sighand->siglock); - if (oldset) - *oldset = current->blocked; - - error = 0; - switch (how) { - case SIG_BLOCK: - sigorsets(¤t->blocked, ¤t->blocked, set); - break; - case SIG_UNBLOCK: - signandsets(¤t->blocked, ¤t->blocked, set); - break; - case SIG_SETMASK: - current->blocked = *set; - break; - default: - error = -EINVAL; - } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - return error; -} - -SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, - sigset_t __user *, oset, size_t, sigsetsize) -{ - int error = -EINVAL; - sigset_t old_set, new_set; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - goto out; - - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; - sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - error = sigprocmask(how, &new_set, &old_set); - if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked; - spin_unlock_irq(¤t->sighand->siglock); - - set_old: - error = -EFAULT; - if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; - } - error = 0; -out: - return error; -} - -long do_sigpending(void __user *set, unsigned long sigsetsize) -{ - long error = -EINVAL; - sigset_t pending; - - if (sigsetsize > sizeof(sigset_t)) - goto out; - - spin_lock_irq(¤t->sighand->siglock); - sigorsets(&pending, ¤t->pending.signal, - ¤t->signal->shared_pending.signal); - spin_unlock_irq(¤t->sighand->siglock); - - /* Outside the lock because only this thread touches it. */ - sigandsets(&pending, ¤t->blocked, &pending); - - error = -EFAULT; - if (!copy_to_user(set, &pending, sigsetsize)) - error = 0; - -out: - return error; -} - -SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) -{ - return do_sigpending(set, sigsetsize); -} - -#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER - -int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) -{ - int err; - - if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) - return -EFAULT; - if (from->si_code < 0) - return __copy_to_user(to, from, sizeof(siginfo_t)) - ? -EFAULT : 0; - /* - * If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * Please remember to update the signalfd_copyinfo() function - * inside fs/signalfd.c too, in case siginfo_t changes. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. 
- */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code & __SI_MASK) { - case __SI_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case __SI_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case __SI_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case __SI_FAULT: - err |= __put_user(from->si_addr, &to->si_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(from->si_trapno, &to->si_trapno); -#endif - break; - case __SI_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - default: /* this is just in case for now ... */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - } - return err; -} - -#endif - -SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, - size_t, sigsetsize) -{ - int ret, sig; - sigset_t these; - struct timespec ts; - siginfo_t info; - long timeout = 0; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&these, uthese, sizeof(these))) - return -EFAULT; - - /* - * Invert the set of allowed signals to get those we - * want to block. - */ - sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&these); - - if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) - return -EFAULT; - if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 - || ts.tv_sec < 0) - return -EINVAL; - } - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = (timespec_to_jiffies(&ts) - + (ts.tv_sec || ts.tv_nsec)); - - if (timeout) { - /* None ready -- temporarily unblock those we're - * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &these); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - } - spin_unlock_irq(¤t->sighand->siglock); - - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user(uinfo, &info)) - ret = -EFAULT; - } - } else { - ret = -EAGAIN; - if (timeout) - ret = -EINTR; - } - - return ret; -} - -SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) -{ - struct siginfo info; - - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current->uid; - - return kill_something_info(sig, &info, pid); -} - -static int do_tkill(pid_t tgid, pid_t pid, int sig) -{ - int error; - struct siginfo info; - struct task_struct *p; - unsigned long flags; - - error = -ESRCH; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid 
= task_tgid_vnr(current); - info.si_uid = current->uid; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. - */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, &info, p); - unlock_task_sighand(p, &flags); - } - } - rcu_read_unlock(); - - return error; -} - -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the @tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) -{ - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - - return do_tkill(tgid, pid, sig); -} - -/* - * Send a signal to only one task, even if it's a CLONE_THREAD task. - */ -SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) -{ - /* This is only valid for single tasks */ - if (pid <= 0) - return -EINVAL; - - return do_tkill(0, pid, sig); -} - -SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, - siginfo_t __user *, uinfo) -{ - siginfo_t info; - - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) - return -EFAULT; - - /* Not even root can pretend to send signals from the kernel. - * Nor can they impersonate a kill()/tgkill(), which adds source info. 
- */ - if (info.si_code >= 0 || info.si_code == SI_TKILL) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info.si_code < 0); - return -EPERM; - } - info.si_signo = sig; - - /* POSIX.1b doesn't mention process groups. */ - return kill_proc_info(sig, &info, pid); -} - -int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) -{ - struct task_struct *t = current; - struct k_sigaction *k; - sigset_t mask; - - if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) - return -EINVAL; - - k = &t->sighand->action[sig-1]; - - spin_lock_irq(¤t->sighand->siglock); - if (oact) - *oact = *k; - - if (act) { - sigdelsetmask(&act->sa.sa_mask, - sigmask(SIGKILL) | sigmask(SIGSTOP)); - *k = *act; - /* - * POSIX 3.3.1.3: - * "Setting a signal action to SIG_IGN for a signal that is - * pending shall cause the pending signal to be discarded, - * whether or not it is blocked." - * - * "Setting a signal action to SIG_DFL for a signal that is - * pending and whose default action is to ignore the signal - * (for example, SIGCHLD), shall cause the pending signal to - * be discarded, whether or not it is blocked" - */ - if (sig_handler_ignored(sig_handler(t, sig), sig)) { - sigemptyset(&mask); - sigaddset(&mask, sig); - rm_from_queue_full(&mask, &t->signal->shared_pending); - do { - rm_from_queue_full(&mask, &t->pending); - t = next_thread(t); - } while (t != current); - } - } - - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -int -do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) -{ - stack_t oss; - int error; - - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); - - if (uss) { - void __user *ss_sp; - size_t ss_size; - int ss_flags; - - error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) - || __get_user(ss_sp, &uss->ss_sp) - || __get_user(ss_flags, &uss->ss_flags) - || __get_user(ss_size, &uss->ss_size)) - goto out; - - 
error = -EPERM; - if (on_sig_stack(sp)) - goto out; - - error = -EINVAL; - /* - * - * Note - this code used to test ss_flags incorrectly - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism - */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) - goto out; - - if (ss_flags == SS_DISABLE) { - ss_size = 0; - ss_sp = NULL; - } else { - error = -ENOMEM; - if (ss_size < MINSIGSTKSZ) - goto out; - } - - current->sas_ss_sp = (unsigned long) ss_sp; - current->sas_ss_size = ss_size; - } - - error = 0; - if (uoss) { - error = -EFAULT; - if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) - goto out; - error = __put_user(oss.ss_sp, &uoss->ss_sp) | - __put_user(oss.ss_size, &uoss->ss_size) | - __put_user(oss.ss_flags, &uoss->ss_flags); - } - -out: - return error; -} - -#ifdef __ARCH_WANT_SYS_SIGPENDING - -SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) -{ - return do_sigpending(set, sizeof(*set)); -} - -#endif - -#ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. 
*/ - -SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, - old_sigset_t __user *, oset) -{ - int error; - old_sigset_t old_set, new_set; - - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; - new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); - - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked.sig[0]; - - error = 0; - switch (how) { - default: - error = -EINVAL; - break; - case SIG_BLOCK: - sigaddsetmask(¤t->blocked, new_set); - break; - case SIG_UNBLOCK: - sigdelsetmask(¤t->blocked, new_set); - break; - case SIG_SETMASK: - current->blocked.sig[0] = new_set; - break; - } - - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - old_set = current->blocked.sig[0]; - set_old: - error = -EFAULT; - if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; - } - error = 0; -out: - return error; -} -#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ - -#ifdef __ARCH_WANT_SYS_RT_SIGACTION -SYSCALL_DEFINE4(rt_sigaction, int, sig, - const struct sigaction __user *, act, - struct sigaction __user *, oact, - size_t, sigsetsize) -{ - struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - goto out; - - if (act) { - if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) - return -EFAULT; - } - - ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); - - if (!ret && oact) { - if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) - return -EFAULT; - } -out: - return ret; -} -#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ - -#ifdef __ARCH_WANT_SYS_SGETMASK - -/* - * For backwards compatibility. Functionality superseded by sigprocmask. 
- */ -SYSCALL_DEFINE0(sgetmask) -{ - /* SMP safe */ - return current->blocked.sig[0]; -} - -SYSCALL_DEFINE1(ssetmask, int, newmask) -{ - int old; - - spin_lock_irq(¤t->sighand->siglock); - old = current->blocked.sig[0]; - - siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| - sigmask(SIGSTOP))); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - return old; -} -#endif /* __ARCH_WANT_SGETMASK */ - -#ifdef __ARCH_WANT_SYS_SIGNAL -/* - * For backwards compatibility. Functionality superseded by sigaction. - */ -SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) -{ - struct k_sigaction new_sa, old_sa; - int ret; - - new_sa.sa.sa_handler = handler; - new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; - sigemptyset(&new_sa.sa.sa_mask); - - ret = do_sigaction(sig, &new_sa, &old_sa); - - return ret ? ret : (unsigned long)old_sa.sa.sa_handler; -} -#endif /* __ARCH_WANT_SYS_SIGNAL */ - -#ifdef __ARCH_WANT_SYS_PAUSE - -SYSCALL_DEFINE0(pause) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} - -#endif - -#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND -SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) -{ - sigset_t newset; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - spin_lock_irq(¤t->sighand->siglock); - current->saved_sigmask = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; -} -#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ - -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - -void __init signals_init(void) -{ - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); -} -/* - * Generic helpers for smp ipi calls - * - * (C) Jens Axboe 2008 - * - */ -#include -#include -#include -#include -#include -#include - -static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); -static LIST_HEAD(call_function_queue); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); - -enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_ALLOC = 0x02, -}; - -struct call_function_data { - struct call_single_data csd; - spinlock_t lock; - unsigned int refs; - cpumask_t cpumask; - struct rcu_head rcu_head; -}; - -struct call_single_queue { - struct list_head list; - spinlock_t lock; -}; - -static int __cpuinit init_call_single_data(void) -{ - int i; - - for_each_possible_cpu(i) { - struct call_single_queue *q = &per_cpu(call_single_queue, i); - - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->list); - } - return 0; -} -early_initcall(init_call_single_data); - -static void csd_flag_wait(struct call_single_data *data) -{ - /* Wait for response */ - do { - /* - * We need to see the flags store in the IPI handler - */ - smp_mb(); - if (!(data->flags & CSD_FLAG_WAIT)) - break; - cpu_relax(); - } while (1); -} - -/* - * Insert a previously allocated call_single_data element for execution - * on the given CPU. 
data must already have ->func, ->info, and ->flags set. - */ -static void generic_exec_single(int cpu, struct call_single_data *data) -{ - struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; - unsigned long flags; - - spin_lock_irqsave(&dst->lock, flags); - ipi = list_empty(&dst->list); - list_add_tail(&data->list, &dst->list); - spin_unlock_irqrestore(&dst->lock, flags); - - if (ipi) - arch_send_call_function_single_ipi(cpu); - - if (wait) - csd_flag_wait(data); -} - -static void rcu_free_call_data(struct rcu_head *head) -{ - struct call_function_data *data; - - data = container_of(head, struct call_function_data, rcu_head); - - kfree(data); -} - -/* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. - */ -void generic_smp_call_function_interrupt(void) -{ - struct call_function_data *data; - int cpu = get_cpu(); - - /* - * It's ok to use list_for_each_rcu() here even though we may delete - * 'pos', since list_del_rcu() doesn't clear ->next - */ - rcu_read_lock(); - list_for_each_entry_rcu(data, &call_function_queue, csd.list) { - int refs; - - if (!cpu_isset(cpu, data->cpumask)) - continue; - - data->csd.func(data->csd.info); - - spin_lock(&data->lock); - cpu_clear(cpu, data->cpumask); - WARN_ON(data->refs == 0); - data->refs--; - refs = data->refs; - spin_unlock(&data->lock); - - if (refs) - continue; - - spin_lock(&call_function_lock); - list_del_rcu(&data->csd.list); - spin_unlock(&call_function_lock); - - if (data->csd.flags & CSD_FLAG_WAIT) { - /* - * serialize stores to data with the flag clear - * and wakeup - */ - smp_wmb(); - data->csd.flags &= ~CSD_FLAG_WAIT; - } - if (data->csd.flags & CSD_FLAG_ALLOC) - call_rcu(&data->rcu_head, rcu_free_call_data); - } - rcu_read_unlock(); - - put_cpu(); -} - -/* - * Invoked by arch to handle an IPI for call function single. Must be called - * from the arch with interrupts disabled. 
- */ -void generic_smp_call_function_single_interrupt(void) -{ - struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); - - /* - * Need to see other stores to list head for checking whether - * list is empty without holding q->lock - */ - smp_mb(); - while (!list_empty(&q->list)) { - unsigned int data_flags; - - spin_lock(&q->lock); - list_replace_init(&q->list, &list); - spin_unlock(&q->lock); - - while (!list_empty(&list)) { - struct call_single_data *data; - - data = list_entry(list.next, struct call_single_data, - list); - list_del(&data->list); - - /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. - */ - data_flags = data->flags; - - data->func(data->info); - - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); - } - /* - * See comment on outer loop - */ - smp_mb(); - } -} - -/* - * smp_call_function_single - Run a function on a specific CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. 
- */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int wait) -{ - struct call_single_data d; - unsigned long flags; - /* prevent preemption and reschedule on another processor, - as well as CPU removal */ - int me = get_cpu(); - int err = 0; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - if (cpu == me) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { - struct call_single_data *data = NULL; - - if (!wait) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->flags = CSD_FLAG_ALLOC; - } - if (!data) { - data = &d; - data->flags = CSD_FLAG_WAIT; - } - - data->func = func; - data->info = info; - generic_exec_single(cpu, data); - } else { - err = -ENXIO; /* CPU not online */ - } - - put_cpu(); - return err; -} -EXPORT_SYMBOL(smp_call_function_single); - -/** - * __smp_call_function_single(): Run a function on another CPU - * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure - * - * Like smp_call_function_single(), but allow caller to pass in a pre-allocated - * data structure. Useful for embedding @data inside other structures, for - * instance. - * - */ -void __smp_call_function_single(int cpu, struct call_single_data *data) -{ - /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); - - generic_exec_single(cpu, data); -} - -/* Dummy function */ -static void quiesce_dummy(void *unused) -{ -} - -/* - * Ensure stack based data used in call function mask is safe to free. - * - * This is needed by smp_call_function_mask when using on-stack data, because - * a single call function queue is shared by all CPUs, and any CPU may pick up - * the data item on the queue at any time before it is deleted. So we need to - * ensure that all CPUs have transitioned through a quiescent state after - * this call. 
- * - * This is a very slow function, implemented by sending synchronous IPIs to - * all possible CPUs. For this reason, we have to alloc data rather than use - * stack based data even in the case of synchronous calls. The stack based - * data is then just used for deadlock/oom fallback which will be very rare. - * - * If a faster scheme can be made, we could go back to preferring stack based - * data -- the data allocation/free is non-zero cost. - */ -static void smp_call_function_mask_quiesce_stack(cpumask_t mask) -{ - struct call_single_data data; - int cpu; - - data.func = quiesce_dummy; - data.info = NULL; - - for_each_cpu_mask(cpu, mask) { - data.flags = CSD_FLAG_WAIT; - generic_exec_single(cpu, &data); - } -} - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. Preemption - * must be disabled when calling this function. 
- */ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) -{ - struct call_function_data d; - struct call_function_data *data = NULL; - cpumask_t allbutself; - unsigned long flags; - int cpu, num_cpus; - int slowpath = 0; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - cpu = smp_processor_id(); - allbutself = cpu_online_map; - cpu_clear(cpu, allbutself); - cpus_and(mask, mask, allbutself); - num_cpus = cpus_weight(mask); - - /* - * If zero CPUs, return. If just a single CPU, turn this request - * into a targetted single call instead since it's faster. - */ - if (!num_cpus) - return 0; - else if (num_cpus == 1) { - cpu = first_cpu(mask); - return smp_call_function_single(cpu, func, info, wait); - } - - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) { - data->csd.flags = CSD_FLAG_ALLOC; - if (wait) - data->csd.flags |= CSD_FLAG_WAIT; - } else { - data = &d; - data->csd.flags = CSD_FLAG_WAIT; - wait = 1; - slowpath = 1; - } - - spin_lock_init(&data->lock); - data->csd.func = func; - data->csd.info = info; - data->refs = num_cpus; - data->cpumask = mask; - - spin_lock_irqsave(&call_function_lock, flags); - list_add_tail_rcu(&data->csd.list, &call_function_queue); - spin_unlock_irqrestore(&call_function_lock, flags); - - /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi(mask); - - /* optionally wait for the CPUs to complete */ - if (wait) { - csd_flag_wait(&data->csd); - if (unlikely(slowpath)) - smp_call_function_mask_quiesce_stack(mask); - } - - return 0; -} -EXPORT_SYMBOL(smp_call_function_mask); - -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. 
- * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. In case of allocation - * failure, @wait will be implicitly turned on. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function(void (*func)(void *), void *info, int wait) -{ - int ret; - - preempt_disable(); - ret = smp_call_function_mask(cpu_online_map, func, info, wait); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(smp_call_function); - -void ipi_call_lock(void) -{ - spin_lock(&call_function_lock); -} - -void ipi_call_unlock(void) -{ - spin_unlock(&call_function_lock); -} - -void ipi_call_lock_irq(void) -{ - spin_lock_irq(&call_function_lock); -} - -void ipi_call_unlock_irq(void) -{ - spin_unlock_irq(&call_function_lock); -} -/* - * linux/kernel/softirq.c - * - * Copyright (C) 1992 Linus Torvalds - * - * Distribute under GPLv2. - * - * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -/* - - No shared variables, all the data are CPU local. - - If a softirq needs serialization, let it serialize itself - by its own spinlocks. - - Even if softirq is serialized, only local cpu is marked for - execution. Hence, we get something sort of weak cpu binding. - Though it is still not clear, will it result in better locality - or will not. - - Examples: - - NET RX softirq. It is multithreaded and does not require - any global serialization. - - NET TX softirq. It kicks software netdevice queues, hence - it is logically serialized per device, but this serialization - is invisible to common code. - - Tasklets: serialized wrt itself. 
- */ - -#ifndef __ARCH_IRQ_STAT -irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; -EXPORT_SYMBOL(irq_stat); -#endif - -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; - -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); - -/* - * we cannot loop indefinitely here to avoid userspace starvation, - * but we also don't want to introduce a worst case 1/HZ latency - * to the pending events, so lets the scheduler to balance - * the softirq load for us. - */ -static inline void wakeup_softirqd(void) -{ - /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); - - if (tsk && tsk->state != TASK_RUNNING) - wake_up_process(tsk); -} - -/* - * This one is for softirq.c-internal use, - * where hardirqs are disabled legitimately: - */ -#ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip) -{ - unsigned long flags; - - WARN_ON_ONCE(in_irq()); - - raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); - /* - * Were softirqs turned off above: - */ - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_off(ip); - raw_local_irq_restore(flags); -} -#else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip) -{ - add_preempt_count(SOFTIRQ_OFFSET); - barrier(); -} -#endif /* CONFIG_TRACE_IRQFLAGS */ - -void local_bh_disable(void) -{ - __local_bh_disable((unsigned long)__builtin_return_address(0)); -} - -EXPORT_SYMBOL(local_bh_disable); - -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - -/* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), - * without processing still-pending softirqs: - */ -void 
_local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - WARN_ON_ONCE(!irqs_disabled()); - - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); - sub_preempt_count(SOFTIRQ_OFFSET); -} - -EXPORT_SYMBOL(_local_bh_enable); - -static inline void _local_bh_enable_ip(unsigned long ip) -{ - WARN_ON_ONCE(in_irq() || irqs_disabled()); -#ifdef CONFIG_TRACE_IRQFLAGS - local_irq_disable(); -#endif - /* - * Are softirqs going to be turned on now: - */ - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on(ip); - /* - * Keep preemption disabled until we are done with - * softirq processing: - */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); - - if (unlikely(!in_interrupt() && local_softirq_pending())) - do_softirq(); - - dec_preempt_count(); -#ifdef CONFIG_TRACE_IRQFLAGS - local_irq_enable(); -#endif - preempt_check_resched(); -} - -void local_bh_enable(void) -{ - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - _local_bh_enable_ip(ip); -} -EXPORT_SYMBOL(local_bh_enable_ip); - -/* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. - * - * This number has been established via experimentation. - * The two things to balance is latency against fairness - - * we want to handle softirqs as soon as possible, but they - * should not be able to lock up the box. 
- */ -#define MAX_SOFTIRQ_RESTART 10 - -asmlinkage void __do_softirq(void) -{ - struct softirq_action *h; - __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; - - pending = local_softirq_pending(); - account_system_vtime(current); - - __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); - - cpu = smp_processor_id(); -restart: - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); - - local_irq_enable(); - - h = softirq_vec; - - do { - if (pending & 1) { - h->action(h); - rcu_bh_qsctr_inc(cpu); - } - h++; - pending >>= 1; - } while (pending); - - local_irq_disable(); - - pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; - - if (pending) - wakeup_softirqd(); - - trace_softirq_exit(); - - account_system_vtime(current); - _local_bh_enable(); -} - -#ifndef __ARCH_HAS_DO_SOFTIRQ - -asmlinkage void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - - pending = local_softirq_pending(); - - if (pending) - __do_softirq(); - - local_irq_restore(flags); -} - -#endif - -/* - * Enter an interrupt context. - */ -void irq_enter(void) -{ -#ifdef CONFIG_NO_HZ - int cpu = smp_processor_id(); - if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif - __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif -} - -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() -#else -# define invoke_softirq() do_softirq() -#endif - -/* - * Exit an interrupt context. 
Process softirqs if needed and possible: - */ -void irq_exit(void) -{ - account_system_vtime(current); - trace_hardirq_exit(); - sub_preempt_count(IRQ_EXIT_OFFSET); - if (!in_interrupt() && local_softirq_pending()) - invoke_softirq(); - -#ifdef CONFIG_NO_HZ - /* Make sure that timer wheel updates are propagated */ - if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(0); - rcu_irq_exit(); -#endif - preempt_enable_no_resched(); -} - -/* - * This function must run with irqs disabled! - */ -inline void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); -} - -void raise_softirq(unsigned int nr) -{ - unsigned long flags; - - local_irq_save(flags); - raise_softirq_irqoff(nr); - local_irq_restore(flags); -} - -void open_softirq(int nr, void (*action)(struct softirq_action *)) -{ - softirq_vec[nr].action = action; -} - -/* Tasklets */ -struct tasklet_head -{ - struct tasklet_struct *head; - struct tasklet_struct **tail; -}; - -static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); -static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); - -void __tasklet_schedule(struct tasklet_struct *t) -{ - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_restore(flags); -} - -EXPORT_SYMBOL(__tasklet_schedule); - -void __tasklet_hi_schedule(struct tasklet_struct *t) -{ - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - 
raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_restore(flags); -} - -EXPORT_SYMBOL(__tasklet_hi_schedule); - -static void tasklet_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).head; - __get_cpu_var(tasklet_vec).head = NULL; - __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); - } -} - -static void tasklet_hi_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - - local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).head; - __get_cpu_var(tasklet_hi_vec).head = NULL; - __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } -} - - -void tasklet_init(struct tasklet_struct *t, - void (*func)(unsigned long), unsigned long data) -{ - t->next = NULL; - t->state = 0; - atomic_set(&t->count, 0); - t->func = func; - t->data = data; -} - -EXPORT_SYMBOL(tasklet_init); - -void tasklet_kill(struct tasklet_struct *t) -{ - if (in_interrupt()) - 
printk("Attempt to kill tasklet from interrupt\n"); - - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - do - yield(); - while (test_bit(TASKLET_STATE_SCHED, &t->state)); - } - tasklet_unlock_wait(t); - clear_bit(TASKLET_STATE_SCHED, &t->state); -} - -EXPORT_SYMBOL(tasklet_kill); - -void __init softirq_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - per_cpu(tasklet_vec, cpu).tail = - &per_cpu(tasklet_vec, cpu).head; - per_cpu(tasklet_hi_vec, cpu).tail = - &per_cpu(tasklet_hi_vec, cpu).head; - } - - open_softirq(TASKLET_SOFTIRQ, tasklet_action); - open_softirq(HI_SOFTIRQ, tasklet_hi_action); -} - -static int ksoftirqd(void * __bind_cpu) -{ - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); - } - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - /* Preempt disable stops cpu going offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; - do_softirq(); - preempt_enable_no_resched(); - cond_resched(); - preempt_disable(); - } - preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; - -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -/* - * tasklet_kill_immediate is called to remove a tasklet which can already be - * scheduled for execution on @cpu. - * - * Unlike tasklet_kill, this function removes the tasklet - * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. - * - * When this function is called, @cpu must be in the CPU_DEAD state. 
- */ -void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) -{ - struct tasklet_struct **i; - - BUG_ON(cpu_online(cpu)); - BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); - - if (!test_bit(TASKLET_STATE_SCHED, &t->state)) - return; - - /* CPU is dead, so no lock needed. */ - for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { - if (*i == t) { - *i = t->next; - /* If this was the tail element, move the tail ptr */ - if (*i == NULL) - per_cpu(tasklet_vec, cpu).tail = i; - return; - } - } - BUG(); -} - -static void takeover_tasklets(unsigned int cpu) -{ - /* CPU is dead, so no lock needed. */ - local_irq_disable(); - - /* Find end, append list for that CPU. */ - if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { - *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; - __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; - per_cpu(tasklet_vec, cpu).head = NULL; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; - } - raise_softirq_irqoff(TASKLET_SOFTIRQ); - - if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { - *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; - __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; - per_cpu(tasklet_hi_vec, cpu).head = NULL; - per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - } - raise_softirq_irqoff(HI_SOFTIRQ); - - local_irq_enable(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: 
- case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); - takeover_tasklets(hotcpu); - break; - } -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static __init int spawn_ksoftirqd(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err == NOTIFY_BAD); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - return 0; -} -early_initcall(spawn_ksoftirqd); - -#ifdef CONFIG_SMP -/* - * Call a function on all processors - */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, wait); - local_irq_disable(); - func(info); - local_irq_enable(); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(on_each_cpu); -#endif -/* - * Detect Soft Lockups - * - * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. - * - * this code detects soft lockups: incidents in where on a CPU - * the kernel does not reschedule for 10 seconds or more. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static DEFINE_SPINLOCK(print_lock); - -static DEFINE_PER_CPU(unsigned long, touch_timestamp); -static DEFINE_PER_CPU(unsigned long, print_timestamp); -static DEFINE_PER_CPU(struct task_struct *, watchdog_task); - -static int __read_mostly did_panic; -int __read_mostly softlockup_thresh = 60; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * soft-lockup occurs: - */ -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int -softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = softlock_panic, -}; - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. 
- */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void __touch_softlockup_watchdog(void) -{ - int this_cpu = raw_smp_processor_id(); - - __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); -} - -void touch_softlockup_watchdog(void) -{ - __raw_get_cpu_var(touch_timestamp) = 0; -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* Cause each CPU to re-update its timestamp rather than complain */ - for_each_online_cpu(cpu) - per_cpu(touch_timestamp, cpu) = 0; -} -EXPORT_SYMBOL(touch_all_softlockup_watchdogs); - -/* - * This callback runs from the timer interrupt, and checks - * whether the watchdog thread has hung or not: - */ -void softlockup_tick(void) -{ - int this_cpu = smp_processor_id(); - unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); - unsigned long print_timestamp; - struct pt_regs *regs = get_irq_regs(); - unsigned long now; - - /* Is detection switched off? */ - if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { - /* Be sure we don't false trigger if switched back on */ - if (touch_timestamp) - per_cpu(touch_timestamp, this_cpu) = 0; - return; - } - - if (touch_timestamp == 0) { - __touch_softlockup_watchdog(); - return; - } - - print_timestamp = per_cpu(print_timestamp, this_cpu); - - /* report at most once a second */ - if (print_timestamp == touch_timestamp || did_panic) - return; - - /* do not print during early bootup: */ - if (unlikely(system_state != SYSTEM_RUNNING)) { - __touch_softlockup_watchdog(); - return; - } - - now = get_timestamp(this_cpu); - - /* - * Wake up the high-prio watchdog task twice per - * threshold timespan. 
- */ - if (now > touch_timestamp + softlockup_thresh/2) - wake_up_process(per_cpu(watchdog_task, this_cpu)); - - /* Warn about unreasonable delays: */ - if (now <= (touch_timestamp + softlockup_thresh)) - return; - - per_cpu(print_timestamp, this_cpu) = touch_timestamp; - - spin_lock(&print_lock); - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_timestamp, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - spin_unlock(&print_lock); - - if (softlockup_panic) - panic("softlockup: hung tasks"); -} - -/* - * Have a reasonable limit on the number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; - -/* - * Zero means infinite timeout - no checking done: - */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -/* - * Only do the hung-tasks check on one CPU: - */ -static int check_cpu __read_mostly = -1; - -static void check_hung_task(struct task_struct *t, unsigned long now) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - if (t->flags & PF_FROZEN) - return; - - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { - t->last_switch_count = switch_count; - t->last_switch_timestamp = now; - return; - } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) - return; - if (sysctl_hung_task_warnings < 0) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - __debug_show_held_locks(t); - - t->last_switch_timestamp = now; - 
touch_nmi_watchdog(); - - if (softlockup_panic) - panic("softlockup: blocked tasks"); -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. - */ -static void check_hung_uninterruptible_tasks(int this_cpu) -{ - int max_count = sysctl_hung_task_check_count; - unsigned long now = get_timestamp(this_cpu); - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if ((tainted & TAINT_DIE) || did_panic) - return; - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (!--max_count) - goto unlock; - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); - } while_each_thread(g, t); - unlock: - read_unlock(&tasklist_lock); -} - -/* - * The watchdog thread - runs every second and touches the timestamp. - */ -static int watchdog(void *__bind_cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - int this_cpu = (long)__bind_cpu; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_softlockup_watchdog(); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). 
- */ - while (!kthread_should_stop()) { - __touch_softlockup_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - if (this_cpu == check_cpu) { - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); - } - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(watchdog_task, hotcpu)); - p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); - if (IS_ERR(p)) { - printk(KERN_ERR "watchdog for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - per_cpu(touch_timestamp, hotcpu) = 0; - per_cpu(watchdog_task, hotcpu) = p; - kthread_bind(p, hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - check_cpu = any_online_cpu(cpu_online_map); - wake_up_process(per_cpu(watchdog_task, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (hotcpu == check_cpu) { - cpumask_t temp_cpu_online_map = cpu_online_map; - - cpu_clear(hotcpu, temp_cpu_online_map); - check_cpu = any_online_cpu(temp_cpu_online_map); - } - break; - - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(watchdog_task, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(watchdog_task, hotcpu), - any_online_cpu(cpu_online_map)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - p = per_cpu(watchdog_task, hotcpu); - per_cpu(watchdog_task, hotcpu) = NULL; - kthread_stop(p); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static int __init spawn_softlockup_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nosoftlockup) - return 0; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - - return 0; -} -early_initcall(spawn_softlockup_task); -/* - * Copyright (2004) Linus Torvalds - * - * Author: Zwane Mwaikambo - * - * Copyright (2004, 2005) Ingo Molnar - * - * This file contains the spinlock/rwlock implementations for the - * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) - * - * Note that some architectures have special knowledge about the - * stack frames of these functions in their profile_pc. If you - * change anything significant here that could change the stack - * frame contact the architecture maintainers. 
- */ - -#include -#include -#include -#include -#include -#include - -int __lockfunc _spin_trylock(spinlock_t *lock) -{ - preempt_disable(); - if (_raw_spin_trylock(lock)) { - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; -} -EXPORT_SYMBOL(_spin_trylock); - -int __lockfunc _read_trylock(rwlock_t *lock) -{ - preempt_disable(); - if (_raw_read_trylock(lock)) { - rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; -} -EXPORT_SYMBOL(_read_trylock); - -int __lockfunc _write_trylock(rwlock_t *lock) -{ - preempt_disable(); - if (_raw_write_trylock(lock)) { - rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable(); - return 0; -} -EXPORT_SYMBOL(_write_trylock); - -/* - * If lockdep is enabled then we use the non-preemption spin-ops - * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are - * not re-enabled during lock-acquire (which the preempt-spin-ops do): - */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) - -void __lockfunc _read_lock(rwlock_t *lock) -{ - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); -} -EXPORT_SYMBOL(_read_lock); - -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif - return flags; -} -EXPORT_SYMBOL(_spin_lock_irqsave); - -void __lockfunc _spin_lock_irq(spinlock_t *lock) -{ - local_irq_disable(); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 
0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} -EXPORT_SYMBOL(_spin_lock_irq); - -void __lockfunc _spin_lock_bh(spinlock_t *lock) -{ - local_bh_disable(); - preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} -EXPORT_SYMBOL(_spin_lock_bh); - -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); - return flags; -} -EXPORT_SYMBOL(_read_lock_irqsave); - -void __lockfunc _read_lock_irq(rwlock_t *lock) -{ - local_irq_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); -} -EXPORT_SYMBOL(_read_lock_irq); - -void __lockfunc _read_lock_bh(rwlock_t *lock) -{ - local_bh_disable(); - preempt_disable(); - rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); -} -EXPORT_SYMBOL(_read_lock_bh); - -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); - return flags; -} -EXPORT_SYMBOL(_write_lock_irqsave); - -void __lockfunc _write_lock_irq(rwlock_t *lock) -{ - local_irq_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); -} -EXPORT_SYMBOL(_write_lock_irq); - -void __lockfunc _write_lock_bh(rwlock_t *lock) -{ - local_bh_disable(); - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); -} -EXPORT_SYMBOL(_write_lock_bh); - -void __lockfunc _spin_lock(spinlock_t *lock) -{ - 
preempt_disable(); - spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} - -EXPORT_SYMBOL(_spin_lock); - -void __lockfunc _write_lock(rwlock_t *lock) -{ - preempt_disable(); - rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); -} - -EXPORT_SYMBOL(_write_lock); - -#else /* CONFIG_PREEMPT: */ - -/* - * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal - * towards that other CPU that it should break the lock ASAP. - * - * (We do this in a function because inlining it would be excessive.) - */ - -#define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ -{ \ - for (;;) { \ - preempt_disable(); \ - if (likely(_raw_##op##_trylock(lock))) \ - break; \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock); \ - \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - for (;;) { \ - preempt_disable(); \ - local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ - break; \ - local_irq_restore(flags); \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ - return flags; \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ - \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ -{ \ - _##op##_lock_irqsave(lock); \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock_irq); \ - \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - /* */ \ - /* Careful: we must exclude softirqs too, hence the */ \ - /* 
irq-disabling. We use the generic preemption-aware */ \ - /* function: */ \ - /**/ \ - flags = _##op##_lock_irqsave(lock); \ - local_bh_disable(); \ - local_irq_restore(flags); \ -} \ - \ -EXPORT_SYMBOL(_##op##_lock_bh) - -/* - * Build preemption-friendly versions of the following - * lock-spinning functions: - * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() - */ -BUILD_LOCK_OPS(spin, spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); - -#endif /* CONFIG_PREEMPT */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) -{ - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} -EXPORT_SYMBOL(_spin_lock_nested); - -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif - return flags; -} -EXPORT_SYMBOL(_spin_lock_irqsave_nested); - -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, - struct lockdep_map *nest_lock) -{ - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -} -EXPORT_SYMBOL(_spin_lock_nest_lock); - -#endif - -void __lockfunc _spin_unlock(spinlock_t *lock) -{ - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_spin_unlock); - -void __lockfunc _write_unlock(rwlock_t *lock) -{ - 
rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock); - -void __lockfunc _read_unlock(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_read_unlock); - -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) -{ - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(_spin_unlock_irqrestore); - -void __lockfunc _spin_unlock_irq(spinlock_t *lock) -{ - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - local_irq_enable(); - preempt_enable(); -} -EXPORT_SYMBOL(_spin_unlock_irq); - -void __lockfunc _spin_unlock_bh(spinlock_t *lock) -{ - spin_release(&lock->dep_map, 1, _RET_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(_spin_unlock_bh); - -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(_read_unlock_irqrestore); - -void __lockfunc _read_unlock_irq(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - local_irq_enable(); - preempt_enable(); -} -EXPORT_SYMBOL(_read_unlock_irq); - -void __lockfunc _read_unlock_bh(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_read_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(_read_unlock_bh); - -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock_irqrestore); - -void 
__lockfunc _write_unlock_irq(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - local_irq_enable(); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock_irq); - -void __lockfunc _write_unlock_bh(rwlock_t *lock) -{ - rwlock_release(&lock->dep_map, 1, _RET_IP_); - _raw_write_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(_write_unlock_bh); - -int __lockfunc _spin_trylock_bh(spinlock_t *lock) -{ - local_bh_disable(); - preempt_disable(); - if (_raw_spin_trylock(lock)) { - spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return 1; - } - - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); - return 0; -} -EXPORT_SYMBOL(_spin_trylock_bh); - -notrace int in_lock_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __lockfunc functions */ - extern char __lock_text_start[], __lock_text_end[]; - - return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; -} -EXPORT_SYMBOL(in_lock_functions); -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Paul McKenney - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - sp->completed = 0; - mutex_init(&sp->mutex); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); - return (sp->per_cpu_ref ? 0 : -ENOMEM); -} - -/* - * srcu_readers_active_idx -- returns approximate number of readers - * active on the specified rank of per-CPU counters. - */ - -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - int sum; - - sum = 0; - for_each_possible_cpu(cpu) - sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; - return sum; -} - -/** - * srcu_readers_active - returns approximate number of readers. - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. - */ -static int srcu_readers_active(struct srcu_struct *sp) -{ - return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - int sum; - - sum = srcu_readers_active(sp); - WARN_ON(sum); /* Leakage unless caller handles error. 
*/ - if (sum != 0) - return; - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} - -/** - * srcu_read_lock - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. - * - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - preempt_disable(); - idx = sp->completed & 0x1; - barrier(); /* ensure compiler looks -once- at sp->completed. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; - srcu_barrier(); /* ensure compiler won't misorder critical section. */ - preempt_enable(); - return idx; -} - -/** - * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). - * - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. - */ -void srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - preempt_disable(); - srcu_barrier(); /* ensure compiler won't misorder critical section. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; - preempt_enable(); -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. 
- * - * Note that it is illegal to call synchornize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - int idx; - - idx = sp->completed; - mutex_lock(&sp->mutex); - - /* - * Check to see if someone else did the work for us while we were - * waiting to acquire the lock. We need -two- advances of - * the counter, not just one. If there was but one, we might have - * shown up -after- our helper's first synchronize_sched(), thus - * having failed to prevent CPU-reordering races with concurrent - * srcu_read_unlock()s on other CPUs (see comment below). So we - * either (1) wait for two or (2) supply the second ourselves. - */ - - if ((sp->completed - idx) >= 2) { - mutex_unlock(&sp->mutex); - return; - } - - synchronize_sched(); /* Force memory barrier on all CPUs. */ - - /* - * The preceding synchronize_sched() ensures that any CPU that - * sees the new value of sp->completed will also see any preceding - * changes to data structures made by this CPU. This prevents - * some other CPU from reordering the accesses in its SRCU - * read-side critical section to precede the corresponding - * srcu_read_lock() -- ensuring that such references will in - * fact be protected. - * - * So it is now safe to do the flip. - */ - - idx = sp->completed & 0x1; - sp->completed++; - - synchronize_sched(); /* Force memory barrier on all CPUs. */ - - /* - * At this point, because of the preceding synchronize_sched(), - * all srcu_read_lock() calls using the old counters have completed. - * Their corresponding critical sections might well be still - * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. 
- */ - - while (srcu_readers_active_idx(sp, idx)) - schedule_timeout_interruptible(1); - - synchronize_sched(); /* Force memory barrier on all CPUs. */ - - /* - * The preceding synchronize_sched() forces all srcu_read_unlock() - * primitives that were executing concurrently with the preceding - * for_each_possible_cpu() loop to have completed by this point. - * More importantly, it also forces the corresponding SRCU read-side - * critical sections to have also completed, and the corresponding - * references to SRCU-protected data items to be dropped. - * - * Note: - * - * Despite what you might think at first glance, the - * preceding synchronize_sched() -must- be within the - * critical section ended by the following mutex_unlock(). - * Otherwise, a task taking the early exit can race - * with a srcu_read_unlock(), which might have executed - * just before the preceding srcu_readers_active() check, - * and whose CPU might have reordered the srcu_read_unlock() - * with the preceding critical section. In this case, there - * is nothing preventing the synchronize_sched() task that is - * taking the early exit from freeing a data structure that - * is still being referenced (out of order) by the task - * doing the srcu_read_unlock(). - * - * Alternatively, the comparison with "2" on the early exit - * could be changed to "3", but this increases synchronize_srcu() - * latency for bulk loads. So the current code is preferred. - */ - - mutex_unlock(&sp->mutex); -} - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. 
- */ - -long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} - -EXPORT_SYMBOL_GPL(init_srcu_struct); -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -EXPORT_SYMBOL_GPL(srcu_read_lock); -EXPORT_SYMBOL_GPL(srcu_read_unlock); -EXPORT_SYMBOL_GPL(synchronize_srcu); -EXPORT_SYMBOL_GPL(srcu_batches_completed); -/* - * kernel/stacktrace.c - * - * Stack trace management functions - * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar - */ -#include -#include -#include -#include - -void print_stack_trace(struct stack_trace *trace, int spaces) -{ - int i; - - if (WARN_ON(!trace->entries)) - return; - - for (i = 0; i < trace->nr_entries; i++) { - printk("%*c", 1 + spaces, ' '); - print_ip_sym(trace->entries[i]); - } -} -EXPORT_SYMBOL_GPL(print_stack_trace); - -/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. - * GPL v2 and any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* This controls the threads on each CPU. */ -enum stopmachine_state { - /* Dummy starting state for thread. */ - STOPMACHINE_NONE, - /* Awaiting everyone to be scheduled. */ - STOPMACHINE_PREPARE, - /* Disable interrupts. */ - STOPMACHINE_DISABLE_IRQ, - /* Run the function */ - STOPMACHINE_RUN, - /* Exit */ - STOPMACHINE_EXIT, -}; -static enum stopmachine_state state; - -struct stop_machine_data { - int (*fn)(void *); - void *data; - int fnret; -}; - -/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ -static unsigned int num_threads; -static atomic_t thread_ack; -static struct completion finished; -static DEFINE_MUTEX(lock); - -static void set_state(enum stopmachine_state newstate) -{ - /* Reset ack counter. */ - atomic_set(&thread_ack, num_threads); - smp_wmb(); - state = newstate; -} - -/* Last one to ack a state moves to the next state. 
*/ -static void ack_state(void) -{ - if (atomic_dec_and_test(&thread_ack)) { - /* If we're the last one to ack the EXIT, we're finished. */ - if (state == STOPMACHINE_EXIT) - complete(&finished); - else - set_state(state + 1); - } -} - -/* This is the actual thread which stops the CPU. It exits by itself rather - * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ -static int stop_cpu(struct stop_machine_data *smdata) -{ - enum stopmachine_state curstate = STOPMACHINE_NONE; - - /* Simple state machine */ - do { - /* Chill out and ensure we re-read stopmachine_state. */ - cpu_relax(); - if (state != curstate) { - curstate = state; - switch (curstate) { - case STOPMACHINE_DISABLE_IRQ: - local_irq_disable(); - hard_irq_disable(); - break; - case STOPMACHINE_RUN: - /* |= allows error detection if functions on - * multiple CPUs. */ - smdata->fnret |= smdata->fn(smdata->data); - break; - default: - break; - } - ack_state(); - } - } while (curstate != STOPMACHINE_EXIT); - - local_irq_enable(); - do_exit(0); -} - -/* Callback for CPUs which aren't supposed to do anything. */ -static int chill(void *unused) -{ - return 0; -} - -int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) -{ - int i, err; - struct stop_machine_data active, idle; - struct task_struct **threads; - - active.fn = fn; - active.data = data; - active.fnret = 0; - idle.fn = chill; - idle.data = NULL; - - /* This could be too big for stack on large machines. */ - threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); - if (!threads) - return -ENOMEM; - - /* Set up initial state. 
*/ - mutex_lock(&lock); - init_completion(&finished); - num_threads = num_online_cpus(); - set_state(STOPMACHINE_PREPARE); - - for_each_online_cpu(i) { - struct stop_machine_data *smdata = &idle; - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - if (!cpus) { - if (i == first_cpu(cpu_online_map)) - smdata = &active; - } else { - if (cpu_isset(i, *cpus)) - smdata = &active; - } - - threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", - i); - if (IS_ERR(threads[i])) { - err = PTR_ERR(threads[i]); - threads[i] = NULL; - goto kill_threads; - } - - /* Place it onto correct cpu. */ - kthread_bind(threads[i], i); - - /* Make it highest prio. */ - if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) - BUG(); - } - - /* We've created all the threads. Wake them all: hold this CPU so one - * doesn't hit this CPU until we're ready. */ - get_cpu(); - for_each_online_cpu(i) - wake_up_process(threads[i]); - - /* This will release the thread on our CPU. */ - put_cpu(); - wait_for_completion(&finished); - mutex_unlock(&lock); - - kfree(threads); - - return active.fnret; - -kill_threads: - for_each_online_cpu(i) - if (threads[i]) - kthread_stop(threads[i]); - mutex_unlock(&lock); - - kfree(threads); - return err; -} - -int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) -{ - int ret; - - /* No CPUs can come up or down during this. 
*/ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); - - return ret; -} -EXPORT_SYMBOL_GPL(stop_machine); -/* - * linux/kernel/sys.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) -#endif -#ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) -#endif -#ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) -#endif -#ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) -#endif -#ifndef GET_TSC_CTL -# define GET_TSC_CTL(a) (-EINVAL) -#endif -#ifndef SET_TSC_CTL -# define SET_TSC_CTL(a) (-EINVAL) -#endif - -/* - * this is where the system-wide overflow UID and GID are defined, for - * architectures that now have 32-bit UID/GID but didn't in the past - */ - -int overflowuid = DEFAULT_OVERFLOWUID; -int overflowgid = DEFAULT_OVERFLOWGID; - -#ifdef CONFIG_UID16 -EXPORT_SYMBOL(overflowuid); -EXPORT_SYMBOL(overflowgid); -#endif - -/* - * the same as above, but for filesystems which can only store a 16-bit - * UID and GID. 
as such, this is needed on all architectures - */ - -int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; -int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; - -EXPORT_SYMBOL(fs_overflowuid); -EXPORT_SYMBOL(fs_overflowgid); - -/* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -static int set_one_prio(struct task_struct *p, int niceval, int error) -{ - int no_nice; - - if (p->uid != current->euid && - p->euid != current->euid && !capable(CAP_SYS_NICE)) { - error = -EPERM; - goto out; - } - if (niceval < task_nice(p) && !can_nice(p, niceval)) { - error = -EACCES; - goto out; - } - no_nice = security_task_setnice(p, niceval); - if (no_nice) { - error = no_nice; - goto out; - } - if (error == -ESRCH) - error = 0; - set_user_nice(p, niceval); -out: - return error; -} - -SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) -{ - struct task_struct *g, *p; - struct user_struct *user; - int error = -EINVAL; - struct pid *pgrp; - - if (which > PRIO_USER || which < PRIO_PROCESS) - goto out; - - /* normalize: avoid signed division (rounding problems) */ - error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; - - read_lock(&tasklist_lock); - switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = current->user; - if (!who) - who = current->uid; - else - if ((who != current->uid) && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - 
do_each_thread(g, p) - if (p->uid == who) - error = set_one_prio(p, niceval, error); - while_each_thread(g, p); - if (who != current->uid) - free_uid(user); /* For find_user() */ - break; - } -out_unlock: - read_unlock(&tasklist_lock); -out: - return error; -} - -/* - * Ugh. To avoid negative return values, "getpriority()" will - * not return the normal nice-value, but a negated value that - * has been offset by 20 (ie it returns 40..1 instead of -20..19) - * to stay compatible. - */ -SYSCALL_DEFINE2(getpriority, int, which, int, who) -{ - struct task_struct *g, *p; - struct user_struct *user; - long niceval, retval = -ESRCH; - struct pid *pgrp; - - if (which > PRIO_USER || which < PRIO_PROCESS) - return -EINVAL; - - read_lock(&tasklist_lock); - switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = current->user; - if (!who) - who = current->uid; - else - if ((who != current->uid) && !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) - if (p->uid == who) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - while_each_thread(g, p); - if (who != current->uid) - free_uid(user); /* for find_user() */ - break; - } -out_unlock: - read_unlock(&tasklist_lock); - - return retval; -} - -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. 
- */ -void emergency_restart(void) -{ - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - device_shutdown(); - sysdev_shutdown(); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. - */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - sysdev_shutdown(); - printk(KERN_EMERG "System halted.\n"); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - disable_nonboot_cpus(); - sysdev_shutdown(); - printk(KERN_EMERG "Power down.\n"); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. 
- * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - char buffer[256]; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - lock_kernel(); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - unlock_kernel(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - unlock_kernel(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - unlock_kernel(); - return -EFAULT; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - { - int ret; - ret = kernel_kexec(); - unlock_kernel(); - return ret; - } -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - { - int ret = hibernate(); - unlock_kernel(); - return ret; - } -#endif - - default: - unlock_kernel(); - return -EINVAL; - } - unlock_kernel(); - return 0; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. 
- * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - -/* - * Unprivileged users may change the real gid to the effective gid - * or vice versa. (BSD-style) - * - * If you set the real gid at all, or set the effective gid to a value not - * equal to the real gid, then the saved gid is set to the new effective gid. - * - * This makes it possible for a setgid program to completely drop its - * privileges, which is often a useful assertion to make when you are doing - * a security audit over a program. - * - * The general idea is that a program which uses just setregid() will be - * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. - * - * SMP: There are not races, the GIDs are checked only by filesystem - * operations (as far as semantic preservation is concerned). 
- */ -SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) -{ - int old_rgid = current->gid; - int old_egid = current->egid; - int new_rgid = old_rgid; - int new_egid = old_egid; - int retval; - - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - return retval; - - if (rgid != (gid_t) -1) { - if ((old_rgid == rgid) || - (current->egid==rgid) || - capable(CAP_SETGID)) - new_rgid = rgid; - else - return -EPERM; - } - if (egid != (gid_t) -1) { - if ((old_rgid == egid) || - (current->egid == egid) || - (current->sgid == egid) || - capable(CAP_SETGID)) - new_egid = egid; - else - return -EPERM; - } - if (new_egid != old_egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old_rgid)) - current->sgid = new_egid; - current->fsgid = new_egid; - current->egid = new_egid; - current->gid = new_rgid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; -} - -/* - * setgid() is implemented like SysV w/ SAVED_IDS - * - * SMP: Same implicit races as above. 
- */ -SYSCALL_DEFINE1(setgid, gid_t, gid) -{ - int old_egid = current->egid; - int retval; - - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - return retval; - - if (capable(CAP_SETGID)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->gid = current->egid = current->sgid = current->fsgid = gid; - } else if ((gid == current->gid) || (gid == current->sgid)) { - if (old_egid != gid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->egid = current->fsgid = gid; - } - else - return -EPERM; - - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; -} - -static int set_user(uid_t new_ruid, int dumpclear) -{ - struct user_struct *new_user; - - new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); - if (!new_user) - return -EAGAIN; - - if (atomic_read(&new_user->processes) >= - current->signal->rlim[RLIMIT_NPROC].rlim_cur && - new_user != current->nsproxy->user_ns->root_user) { - free_uid(new_user); - return -EAGAIN; - } - - switch_uid(new_user); - - if (dumpclear) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->uid = new_ruid; - return 0; -} - -/* - * Unprivileged users may change the real uid to the effective uid - * or vice versa. (BSD-style) - * - * If you set the real uid at all, or set the effective uid to a value not - * equal to the real uid, then the saved uid is set to the new effective uid. - * - * This makes it possible for a setuid program to completely drop its - * privileges, which is often a useful assertion to make when you are doing - * a security audit over a program. - * - * The general idea is that a program which uses just setreuid() will be - * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. 
- */ -SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) -{ - int old_ruid, old_euid, old_suid, new_ruid, new_euid; - int retval; - - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - return retval; - - new_ruid = old_ruid = current->uid; - new_euid = old_euid = current->euid; - old_suid = current->suid; - - if (ruid != (uid_t) -1) { - new_ruid = ruid; - if ((old_ruid != ruid) && - (current->euid != ruid) && - !capable(CAP_SETUID)) - return -EPERM; - } - - if (euid != (uid_t) -1) { - new_euid = euid; - if ((old_ruid != euid) && - (current->euid != euid) && - (current->suid != euid) && - !capable(CAP_SETUID)) - return -EPERM; - } - - if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) - return -EAGAIN; - - if (new_euid != old_euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->fsuid = current->euid = new_euid; - if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old_ruid)) - current->suid = current->euid; - current->fsuid = current->euid; - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); -} - - - -/* - * setuid() is implemented like SysV with SAVED_IDS - * - * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal - * user and then switch back, because if you're root, setuid() sets - * the saved uid too. If you don't like this, blame the bright people - * in the POSIX committee and/or USG. Note that the BSD-style setreuid() - * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. 
- */ -SYSCALL_DEFINE1(setuid, uid_t, uid) -{ - int old_euid = current->euid; - int old_ruid, old_suid, new_suid; - int retval; - - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - return retval; - - old_ruid = current->uid; - old_suid = current->suid; - new_suid = old_suid; - - if (capable(CAP_SETUID)) { - if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) - return -EAGAIN; - new_suid = uid; - } else if ((uid != current->uid) && (uid != new_suid)) - return -EPERM; - - if (old_euid != uid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->fsuid = current->euid = uid; - current->suid = new_suid; - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); -} - - -/* - * This function implements a generic ability to update ruid, euid, - * and suid. This allows you to implement the 4.4 compatible seteuid(). - */ -SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) -{ - int old_ruid = current->uid; - int old_euid = current->euid; - int old_suid = current->suid; - int retval; - - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - return retval; - - if (!capable(CAP_SETUID)) { - if ((ruid != (uid_t) -1) && (ruid != current->uid) && - (ruid != current->euid) && (ruid != current->suid)) - return -EPERM; - if ((euid != (uid_t) -1) && (euid != current->uid) && - (euid != current->euid) && (euid != current->suid)) - return -EPERM; - if ((suid != (uid_t) -1) && (suid != current->uid) && - (suid != current->euid) && (suid != current->suid)) - return -EPERM; - } - if (ruid != (uid_t) -1) { - if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) - return -EAGAIN; - } - if (euid != (uid_t) -1) { - if (euid != current->euid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->euid = euid; - } - current->fsuid = current->euid; - if 
(suid != (uid_t) -1) - current->suid = suid; - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); -} - -SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) -{ - int retval; - - if (!(retval = put_user(current->uid, ruid)) && - !(retval = put_user(current->euid, euid))) - retval = put_user(current->suid, suid); - - return retval; -} - -/* - * Same as above, but for rgid, egid, sgid. - */ -SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) -{ - int retval; - - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - return retval; - - if (!capable(CAP_SETGID)) { - if ((rgid != (gid_t) -1) && (rgid != current->gid) && - (rgid != current->egid) && (rgid != current->sgid)) - return -EPERM; - if ((egid != (gid_t) -1) && (egid != current->gid) && - (egid != current->egid) && (egid != current->sgid)) - return -EPERM; - if ((sgid != (gid_t) -1) && (sgid != current->gid) && - (sgid != current->egid) && (sgid != current->sgid)) - return -EPERM; - } - if (egid != (gid_t) -1) { - if (egid != current->egid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->egid = egid; - } - current->fsgid = current->egid; - if (rgid != (gid_t) -1) - current->gid = rgid; - if (sgid != (gid_t) -1) - current->sgid = sgid; - - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - return 0; -} - -SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) -{ - int retval; - - if (!(retval = put_user(current->gid, rgid)) && - !(retval = put_user(current->egid, egid))) - retval = put_user(current->sgid, sgid); - - return retval; -} - - -/* - * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This - * is used for "access()" and for the NFS daemon (letting nfsd stay at - * whatever uid it wants to). 
It normally shadows "euid", except when - * explicitly set by setfsuid() or for access.. - */ -SYSCALL_DEFINE1(setfsuid, uid_t, uid) -{ - int old_fsuid; - - old_fsuid = current->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) - return old_fsuid; - - if (uid == current->uid || uid == current->euid || - uid == current->suid || uid == current->fsuid || - capable(CAP_SETUID)) { - if (uid != old_fsuid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->fsuid = uid; - } - - key_fsuid_changed(current); - proc_id_connector(current, PROC_EVENT_UID); - - security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); - - return old_fsuid; -} - -/* - * Samma pÃ¥ svenska.. - */ -SYSCALL_DEFINE1(setfsgid, gid_t, gid) -{ - int old_fsgid; - - old_fsgid = current->fsgid; - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - return old_fsgid; - - if (gid == current->gid || gid == current->egid || - gid == current->sgid || gid == current->fsgid || - capable(CAP_SETGID)) { - if (gid != old_fsgid) { - set_dumpable(current->mm, suid_dumpable); - smp_wmb(); - } - current->fsgid = gid; - key_fsgid_changed(current); - proc_id_connector(current, PROC_EVENT_GID); - } - return old_fsgid; -} - -SYSCALL_DEFINE1(times, struct tms __user *, tbuf) -{ - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. 
- */ - if (tbuf) { - struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); - if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) - return -EFAULT; - } - return (long) jiffies_64_to_clock_t(get_jiffies_64()); -} - -/* - * This needs some heavy checking ... - * I just haven't the stomach for it. I also don't fully - * understand sessions/pgrp etc. Let somebody who does explain it. - * - * OK, I think I have the protection semantics right.... this is really - * only important on a multi-user system anyway, to make sure one user - * can't send a signal to a process owned by another. -TYT, 12/12/91 - * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 - */ -SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) -{ - struct task_struct *p; - struct task_struct *group_leader = current->group_leader; - struct pid *pgrp; - int err; - - if (!pid) - pid = task_pid_vnr(group_leader); - if (!pgid) - pgid = pid; - if (pgid < 0) - return -EINVAL; - - /* From this point forward we keep holding onto the tasklist lock - * so that our parent does not change from under us. 
-DaveM - */ - write_lock_irq(&tasklist_lock); - - err = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - - err = -EINVAL; - if (!thread_group_leader(p)) - goto out; - - if (same_thread_group(p->real_parent, group_leader)) { - err = -EPERM; - if (task_session(p) != task_session(group_leader)) - goto out; - err = -EACCES; - if (p->did_exec) - goto out; - } else { - err = -ESRCH; - if (p != group_leader) - goto out; - } - - err = -EPERM; - if (p->signal->leader) - goto out; - - pgrp = task_pid(p); - if (pgid != pid) { - struct task_struct *g; - - pgrp = find_vpid(pgid); - g = pid_task(pgrp, PIDTYPE_PGID); - if (!g || task_session(g) != task_session(group_leader)) - goto out; - } - - err = security_task_setpgid(p, pgid); - if (err) - goto out; - - if (task_pgrp(p) != pgrp) { - change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } - - err = 0; -out: - /* All paths lead to here, thus we are safe. -DaveM */ - write_unlock_irq(&tasklist_lock); - return err; -} - -SYSCALL_DEFINE1(getpgid, pid_t, pid) -{ - struct task_struct *p; - struct pid *grp; - int retval; - - rcu_read_lock(); - if (!pid) - grp = task_pgrp(current); - else { - retval = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - grp = task_pgrp(p); - if (!grp) - goto out; - - retval = security_task_getpgid(p); - if (retval) - goto out; - } - retval = pid_vnr(grp); -out: - rcu_read_unlock(); - return retval; -} - -#ifdef __ARCH_WANT_SYS_GETPGRP - -SYSCALL_DEFINE0(getpgrp) -{ - return sys_getpgid(0); -} - -#endif - -SYSCALL_DEFINE1(getsid, pid_t, pid) -{ - struct task_struct *p; - struct pid *sid; - int retval; - - rcu_read_lock(); - if (!pid) - sid = task_session(current); - else { - retval = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - sid = task_session(p); - if (!sid) - goto out; - - retval = security_task_getsid(p); - if (retval) - goto out; - } - retval = pid_vnr(sid); -out: - rcu_read_unlock(); - return retval; -} - -SYSCALL_DEFINE0(setsid) -{ - 
struct task_struct *group_leader = current->group_leader; - struct pid *sid = task_pid(group_leader); - pid_t session = pid_vnr(sid); - int err = -EPERM; - - write_lock_irq(&tasklist_lock); - /* Fail if I am already a session leader */ - if (group_leader->signal->leader) - goto out; - - /* Fail if a process group id already exists that equals the - * proposed session id. - */ - if (pid_task(sid, PIDTYPE_PGID)) - goto out; - - group_leader->signal->leader = 1; - __set_special_pids(sid); - - spin_lock(&group_leader->sighand->siglock); - group_leader->signal->tty = NULL; - spin_unlock(&group_leader->sighand->siglock); - - err = session; -out: - write_unlock_irq(&tasklist_lock); - return err; -} - -/* - * Supplementary group IDs - */ - -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - -struct group_info *groups_alloc(int gidsetsize) -{ - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? 
: 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) - return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - gid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; - -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; -} - -EXPORT_SYMBOL(groups_alloc); - -void groups_free(struct group_info *group_info) -{ - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); -} - -EXPORT_SYMBOL(groups_free); - -/* export the group_info to a user-space array */ -static int groups_to_user(gid_t __user *grouplist, - struct group_info *group_info) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* fill a group_info from a user-space array - it must be allocated already */ -static int groups_from_user(struct group_info *group_info, - gid_t __user *grouplist) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_from_user(group_info->blocks[i], grouplist, len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; 
-} - -/* a simple Shell sort */ -static void groups_sort(struct group_info *group_info) -{ - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); - - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); - right = left; - left -= stride; - } - GROUP_AT(group_info, right) = tmp; - } - stride /= 3; - } -} - -/* a simple bsearch */ -int groups_search(struct group_info *group_info, gid_t grp) -{ - unsigned int left, right; - - if (!group_info) - return 0; - - left = 0; - right = group_info->ngroups; - while (left < right) { - unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) - left = mid + 1; - else if (cmp < 0) - right = mid; - else - return 1; - } - return 0; -} - -/* validate and set current->group_info */ -int set_current_groups(struct group_info *group_info) -{ - int retval; - struct group_info *old_info; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - - groups_sort(group_info); - get_group_info(group_info); - - task_lock(current); - old_info = current->group_info; - current->group_info = group_info; - task_unlock(current); - - put_group_info(old_info); - - return 0; -} - -EXPORT_SYMBOL(set_current_groups); - -SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - int i = 0; - - /* - * SMP: Nobody else can change our grouplist. Thus we are - * safe. 
- */ - - if (gidsetsize < 0) - return -EINVAL; - - /* no need to grab task_lock here; it cannot change */ - i = current->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups_to_user(grouplist, current->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - return i; -} - -/* - * SMP: Our groups are copy-on-write. We can set them safely - * without another task interfering. - */ - -SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - struct group_info *group_info; - int retval; - - if (!capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } - - retval = set_current_groups(group_info); - put_group_info(group_info); - - return retval; -} - -/* - * Check whether we're fsgid/egid or in the supplemental group.. 
- */ -int in_group_p(gid_t grp) -{ - int retval = 1; - if (grp != current->fsgid) - retval = groups_search(current->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_group_p); - -int in_egroup_p(gid_t grp) -{ - int retval = 1; - if (grp != current->egid) - retval = groups_search(current->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_egroup_p); - -DECLARE_RWSEM(uts_sem); - -SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) -{ - int errno = 0; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -} - -SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) -{ - int errno; - char tmp[__NEW_UTS_LEN]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; - errno = 0; - } - up_write(&uts_sem); - return errno; -} - -#ifdef __ARCH_WANT_SYS_GETHOSTNAME - -SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) -{ - int i, errno; - - if (len < 0) - return -EINVAL; - down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); - if (i > len) - i = len; - errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -} - -#endif - -/* - * Only setdomainname; getdomainname can be implemented by calling - * uname() - */ -SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) -{ - int errno; - char tmp[__NEW_UTS_LEN]; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; - errno = 0; - } - up_write(&uts_sem); - return errno; -} - -SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct 
rlimit __user *, rlim) -{ - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } -} - -#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT - -/* - * Back compatibility for getrlimit. Needed for some apps. - */ - -SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, - struct rlimit __user *, rlim) -{ - struct rlimit x; - if (resource >= RLIM_NLIMITS) - return -EINVAL; - - task_lock(current->group_leader); - x = current->signal->rlim[resource]; - task_unlock(current->group_leader); - if (x.rlim_cur > 0x7FFFFFFF) - x.rlim_cur = 0x7FFFFFFF; - if (x.rlim_max > 0x7FFFFFFF) - x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; -} - -#endif - -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) -{ - struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; - int retval; - - if (resource >= RLIM_NLIMITS) - return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - - retval = security_task_setrlimit(resource, &new_rlim); - if (retval) - return retval; - - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim.rlim_cur = 1; - } - - task_lock(current->group_leader); - *old_rlim = new_rlim; - task_unlock(current->group_leader); - - if (resource != RLIMIT_CPU) - goto out; - - /* - * RLIMIT_CPU handling. 
Note that the kernel fails to return an error - * code if it rejected the user's attempt to set RLIMIT_CPU. This is a - * very long-standing error, and fixing it now risks breakage of - * applications, so we live with it - */ - if (new_rlim.rlim_cur == RLIM_INFINITY) - goto out; - - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - } -out: - return 0; -} - -/* - * It would make sense to put struct rusage in the task_struct, - * except that would make the task_struct be *really big*. After - * task_struct gets moved into malloc'ed memory, it would - * make sense to do this. It will make moving the rest of the information - * a lot simpler! (Which we're not doing right now because we're not - * measuring them yet). - * - * When sampling multiple threads for RUSAGE_SELF, under SMP we might have - * races with threads incrementing their own counters. But since word - * reads are atomic, we either get new values or old values and we don't - * care which for the sums. We always take the siglock to protect reading - * the c* fields from p->signal from races with exit.c updating those - * fields when reaping, so a sample either gets all the additions of a - * given child after it's reaped, or none so this sample is before reaping. - * - * Locking: - * We need to take the siglock for CHILDEREN, SELF and BOTH - * for the cases current multithreaded, non-current single threaded - * non-current multithreaded. Thread traversal is now safe with - * the siglock held. 
- * Strictly speaking, we donot need to take the siglock if we are current and - * single threaded, as no one else can take our signal_struct away, no one - * else can reap the children to update signal->c* counters, and no one else - * can race with the signal-> fields. If we do not take any lock, the - * signal-> fields could be read out of order while another thread was just - * exiting. So we should place a read memory barrier when we avoid the lock. - * On the writer side, write memory barrier is implied in __exit_signal - * as __exit_signal releases the siglock spinlock after updating the signal-> - * fields. But we don't do this yet to keep things simple. - * - */ - -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, - cputime_t *utimep, cputime_t *stimep) -{ - *utimep = cputime_add(*utimep, t->utime); - *stimep = cputime_add(*stimep, t->stime); - r->ru_nvcsw += t->nvcsw; - r->ru_nivcsw += t->nivcsw; - r->ru_minflt += t->min_flt; - r->ru_majflt += t->maj_flt; - r->ru_inblock += task_io_get_inblock(t); - r->ru_oublock += task_io_get_oublock(t); -} - -static void k_getrusage(struct task_struct *p, int who, struct rusage *r) -{ - struct task_struct *t; - unsigned long flags; - cputime_t utime, stime; - - memset((char *) r, 0, sizeof *r); - utime = stime = cputime_zero; - - if (who == RUSAGE_THREAD) { - accumulate_thread_rusage(p, r, &utime, &stime); - goto out; - } - - if (!lock_task_sighand(p, &flags)) - return; - - switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); - r->ru_nvcsw += 
p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - t = p; - do { - accumulate_thread_rusage(t, r, &utime, &stime); - t = next_thread(t); - } while (t != p); - break; - - default: - BUG(); - } - unlock_task_sighand(p, &flags); - -out: - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); -} - -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -{ - struct rusage r; - k_getrusage(p, who, &r); - return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; -} - -SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) -{ - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && - who != RUSAGE_THREAD) - return -EINVAL; - return getrusage(current, who, ru); -} - -SYSCALL_DEFINE1(umask, int, mask) -{ - mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); - return mask; -} - -SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, - unsigned long, arg4, unsigned long, arg5) -{ - long error = 0; - - if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) - return error; - - switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - current->pdeath_signal = arg2; - break; - case PR_GET_PDEATHSIG: - error = put_user(current->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(current->mm); - break; - case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { - error = -EINVAL; - break; - } - set_dumpable(current->mm, arg2); - break; - - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(current, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(current, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(current, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(current, arg2); - break; - case PR_SET_FPEXC: - error = 
SET_FPEXC_CTL(current, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(current, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; - break; - case PR_SET_TIMING: - if (arg2 != PR_TIMING_STATISTICAL) - error = -EINVAL; - break; - - case PR_SET_NAME: { - struct task_struct *me = current; - unsigned char ncomm[sizeof(me->comm)]; - - ncomm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(ncomm, (char __user *)arg2, - sizeof(me->comm)-1) < 0) - return -EFAULT; - set_task_comm(me, ncomm); - return 0; - } - case PR_GET_NAME: { - struct task_struct *me = current; - unsigned char tcomm[sizeof(me->comm)]; - - get_task_comm(tcomm, me); - if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) - return -EFAULT; - return 0; - } - case PR_GET_ENDIAN: - error = GET_ENDIAN(current, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(current, arg2); - break; - - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); - break; - case PR_GET_TSC: - error = GET_TSC_CTL(arg2); - break; - case PR_SET_TSC: - error = SET_TSC_CTL(arg2); - break; - default: - error = -EINVAL; - break; - } - return error; -} - -SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) -{ - int err = 0; - int cpu = raw_smp_processor_id(); - if (cpup) - err |= put_user(cpu, cpup); - if (nodep) - err |= put_user(cpu_to_node(cpu), nodep); - return err ? -EFAULT : 0; -} - -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static void argv_cleanup(char **argv, char **envp) -{ - argv_free(argv); -} - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. 
- */ -int orderly_poweroff(bool force) -{ - int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret = -ENOMEM; - struct subprocess_info *info; - - if (argv == NULL) { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - goto out; - } - - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setcleanup(info, argv_cleanup); - - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); - - out: - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - -#include -#include - -#include - -/* we can't #include here, - but tell gcc to not warn with -Wmissing-prototypes */ -asmlinkage long sys_ni_syscall(void); - -/* - * Non-implemented system calls get redirected here. 
- */ -asmlinkage long sys_ni_syscall(void) -{ - return -ENOSYS; -} - -cond_syscall(sys_nfsservctl); -cond_syscall(sys_quotactl); -cond_syscall(sys32_quotactl); -cond_syscall(sys_acct); -cond_syscall(sys_lookup_dcookie); -cond_syscall(sys_swapon); -cond_syscall(sys_swapoff); -cond_syscall(sys_kexec_load); -cond_syscall(compat_sys_kexec_load); -cond_syscall(sys_init_module); -cond_syscall(sys_delete_module); -cond_syscall(sys_socketpair); -cond_syscall(sys_bind); -cond_syscall(sys_listen); -cond_syscall(sys_accept); -cond_syscall(sys_paccept); -cond_syscall(sys_connect); -cond_syscall(sys_getsockname); -cond_syscall(sys_getpeername); -cond_syscall(sys_sendto); -cond_syscall(sys_send); -cond_syscall(sys_recvfrom); -cond_syscall(sys_recv); -cond_syscall(sys_socket); -cond_syscall(sys_setsockopt); -cond_syscall(compat_sys_setsockopt); -cond_syscall(sys_getsockopt); -cond_syscall(compat_sys_getsockopt); -cond_syscall(sys_shutdown); -cond_syscall(sys_sendmsg); -cond_syscall(compat_sys_sendmsg); -cond_syscall(sys_recvmsg); -cond_syscall(compat_sys_recvmsg); -cond_syscall(sys_socketcall); -cond_syscall(sys_futex); -cond_syscall(compat_sys_futex); -cond_syscall(sys_set_robust_list); -cond_syscall(compat_sys_set_robust_list); -cond_syscall(sys_get_robust_list); -cond_syscall(compat_sys_get_robust_list); -cond_syscall(sys_epoll_create); -cond_syscall(sys_epoll_create1); -cond_syscall(sys_epoll_ctl); -cond_syscall(sys_epoll_wait); -cond_syscall(sys_epoll_pwait); -cond_syscall(compat_sys_epoll_pwait); -cond_syscall(sys_semget); -cond_syscall(sys_semop); -cond_syscall(sys_semtimedop); -cond_syscall(sys_semctl); -cond_syscall(sys_msgget); -cond_syscall(sys_msgsnd); -cond_syscall(sys_msgrcv); -cond_syscall(sys_msgctl); -cond_syscall(sys_shmget); -cond_syscall(sys_shmat); -cond_syscall(sys_shmdt); -cond_syscall(sys_shmctl); -cond_syscall(sys_mq_open); -cond_syscall(sys_mq_unlink); -cond_syscall(sys_mq_timedsend); -cond_syscall(sys_mq_timedreceive); -cond_syscall(sys_mq_notify); 
-cond_syscall(sys_mq_getsetattr); -cond_syscall(compat_sys_mq_open); -cond_syscall(compat_sys_mq_timedsend); -cond_syscall(compat_sys_mq_timedreceive); -cond_syscall(compat_sys_mq_notify); -cond_syscall(compat_sys_mq_getsetattr); -cond_syscall(sys_mbind); -cond_syscall(sys_get_mempolicy); -cond_syscall(sys_set_mempolicy); -cond_syscall(compat_sys_mbind); -cond_syscall(compat_sys_get_mempolicy); -cond_syscall(compat_sys_set_mempolicy); -cond_syscall(sys_add_key); -cond_syscall(sys_request_key); -cond_syscall(sys_keyctl); -cond_syscall(compat_sys_keyctl); -cond_syscall(compat_sys_socketcall); -cond_syscall(sys_inotify_init); -cond_syscall(sys_inotify_init1); -cond_syscall(sys_inotify_add_watch); -cond_syscall(sys_inotify_rm_watch); -cond_syscall(sys_migrate_pages); -cond_syscall(sys_move_pages); -cond_syscall(sys_chown16); -cond_syscall(sys_fchown16); -cond_syscall(sys_getegid16); -cond_syscall(sys_geteuid16); -cond_syscall(sys_getgid16); -cond_syscall(sys_getgroups16); -cond_syscall(sys_getresgid16); -cond_syscall(sys_getresuid16); -cond_syscall(sys_getuid16); -cond_syscall(sys_lchown16); -cond_syscall(sys_setfsgid16); -cond_syscall(sys_setfsuid16); -cond_syscall(sys_setgid16); -cond_syscall(sys_setgroups16); -cond_syscall(sys_setregid16); -cond_syscall(sys_setresgid16); -cond_syscall(sys_setresuid16); -cond_syscall(sys_setreuid16); -cond_syscall(sys_setuid16); -cond_syscall(sys_vm86old); -cond_syscall(sys_vm86); -cond_syscall(compat_sys_ipc); -cond_syscall(compat_sys_sysctl); -cond_syscall(sys_syslog); - -/* arch-specific weak syscall entries */ -cond_syscall(sys_pciconfig_read); -cond_syscall(sys_pciconfig_write); -cond_syscall(sys_pciconfig_iobase); -cond_syscall(sys32_ipc); -cond_syscall(sys32_sysctl); -cond_syscall(ppc_rtas); -cond_syscall(sys_spu_run); -cond_syscall(sys_spu_create); -cond_syscall(sys_subpage_prot); - -/* mmu depending weak syscall entries */ -cond_syscall(sys_mprotect); -cond_syscall(sys_msync); -cond_syscall(sys_mlock); 
-cond_syscall(sys_munlock); -cond_syscall(sys_mlockall); -cond_syscall(sys_munlockall); -cond_syscall(sys_mincore); -cond_syscall(sys_madvise); -cond_syscall(sys_mremap); -cond_syscall(sys_remap_file_pages); -cond_syscall(compat_sys_move_pages); -cond_syscall(compat_sys_migrate_pages); - -/* block-layer dependent */ -cond_syscall(sys_bdflush); -cond_syscall(sys_ioprio_set); -cond_syscall(sys_ioprio_get); - -/* New file descriptors */ -cond_syscall(sys_signalfd); -cond_syscall(sys_signalfd4); -cond_syscall(compat_sys_signalfd); -cond_syscall(compat_sys_signalfd4); -cond_syscall(sys_timerfd_create); -cond_syscall(sys_timerfd_settime); -cond_syscall(sys_timerfd_gettime); -cond_syscall(compat_sys_timerfd_settime); -cond_syscall(compat_sys_timerfd_gettime); -cond_syscall(sys_eventfd); -cond_syscall(sys_eventfd2); -/* - * sysctl.c: General linux system control interface - * - * Begun 24 March 1995, Stephen Tweedie - * Added /proc support, Dec 1995 - * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. - * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. - * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. - * Dynamic registration fixes, Stephen Tweedie. - * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. - * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris - * Horn. - * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. - * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. - * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill - * Wendling. - * The list_for_each() macro wasn't appropriate for the sysctl loop. 
- * Removed it and replaced it with older style, 03/23/00, Bill Wendling - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef CONFIG_X86 -#include -#include -#include -#endif - -static int deprecated_sysctl_warning(struct __sysctl_args *args); - -#if defined(CONFIG_SYSCTL) - -/* External variables not in a header file. */ -extern int C_A_D; -extern int print_fatal_signals; -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern int sysctl_panic_on_oom; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_oom_dump_tasks; -extern int max_threads; -extern int core_uses_pid; -extern int suid_dumpable; -extern char core_pattern[]; -extern int pid_max; -extern int min_free_kbytes; -extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; -extern int percpu_pagelist_fraction; -extern int compat_log; -extern int maps_protect; -extern int latencytop_enabled; -extern int sysctl_nr_open_min, sysctl_nr_open_max; -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ - -/* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int one = 1; -#endif - -#ifdef CONFIG_DETECT_SOFTLOCKUP -static int sixty = 60; -static int neg_one = -1; -#endif - -#ifdef CONFIG_MMU -static int two = 2; -#endif - -static int zero; -static int one_hundred = 100; - -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; -static int min_percpu_pagelist_fract = 8; - -static int ngroups_max = NGROUPS_MAX; - -#ifdef CONFIG_MODULES -extern char modprobe_path[]; -#endif -#ifdef 
CONFIG_CHR_DEV_SG -extern int sg_big_buff; -#endif - -#ifdef __sparc__ -extern char reboot_command []; -extern int stop_a_enabled; -extern int scons_pwroff; -#endif - -#ifdef __hppa__ -extern int pwrsw_enabled; -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU -extern int sysctl_ieee_emulation_warnings; -#endif -extern int sysctl_userprocess_debug; -extern int spin_retry; -#endif - -#ifdef CONFIG_BSD_PROCESS_ACCT -extern int acct_parm[]; -#endif - -#ifdef CONFIG_IA64 -extern int no_unaligned_warning; -#endif - -#ifdef CONFIG_RT_MUTEXES -extern int max_lock_depth; -#endif - -#ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -static struct ctl_table root_table[]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - .count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_INOTIFY_USER -extern struct ctl_table inotify_table[]; -#endif -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif - -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT -int sysctl_legacy_va_layout; -#endif - -extern int prove_locking; -extern int lock_stat; - -/* The default sysctl tables: */ - -static struct 
ctl_table root_table[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .ctl_name = CTL_VM, - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .ctl_name = CTL_FS, - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .ctl_name = CTL_DEBUG, - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .ctl_name = CTL_DEV, - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ - { .ctl_name = 0 } -}; - -#ifdef CONFIG_SCHED_DEBUG -static int min_sched_granularity_ns = 100000; /* 100 usecs */ -static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ -static int min_wakeup_granularity_ns; /* 0 usecs */ -static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -#endif - -static struct ctl_table kern_table[] = { -#ifdef CONFIG_SCHED_DEBUG - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - 
.procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_migration_cost", - .data = &sysctl_sched_migration_cost, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_nr_migrate", - .data = &sysctl_sched_nr_migrate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &sched_rt_handler, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &sched_rt_handler, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_PROVE_LOCKING - { - .ctl_name = CTL_UNNUMBERED, - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .ctl_name = CTL_UNNUMBERED, - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = KERN_PANIC, - .procname = 
"panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = KERN_CORE_USES_PID, - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = KERN_CORE_PATTERN, - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_taint, - }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .ctl_name = KERN_REALROOTDEV, - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef __sparc__ - { - .ctl_name = KERN_SPARC_REBOOT, - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_SPARC_STOP_A, - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = KERN_SPARC_SCONS_PWROFF, - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef __hppa__ - { - .ctl_name = KERN_HPPA_PWRSW, - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - 
.ctl_name = KERN_HPPA_UNALIGNED, - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = KERN_CTLALTDEL, - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_FTRACE - { - .ctl_name = CTL_UNNUMBERED, - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ftrace_enable_sysctl, - }, -#endif -#ifdef CONFIG_MODULES - { - .ctl_name = KERN_MODPROBE, - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, -#endif -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) - { - .ctl_name = KERN_HOTPLUG, - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, -#endif -#ifdef CONFIG_CHR_DEV_SG - { - .ctl_name = KERN_SG_BIG_BUFF, - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .ctl_name = KERN_ACCT, - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .ctl_name = KERN_SYSRQ, - .procname = "sysrq", - .data = &__sysrq_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = &proc_do_cad_pid, - }, -#endif - { - .ctl_name = KERN_MAX_THREADS, - .procname = "threads-max", - .data = &max_threads, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = 
KERN_RANDOM, - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { - .ctl_name = KERN_OVERFLOWUID, - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .ctl_name = KERN_OVERFLOWGID, - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .ctl_name = KERN_IEEE_EMULATION_WARNINGS, - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = KERN_S390_USER_DEBUG_LOGGING, - .procname = "userprocess_debug", - .data = &sysctl_userprocess_debug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = KERN_PIDMAX, - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = sysctl_intvec, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, - { - .ctl_name = KERN_PANIC_ON_OOPS, - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#if defined CONFIG_PRINTK - { - .ctl_name = KERN_PRINTK, - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = KERN_PRINTK_RATELIMIT, - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, - { - .ctl_name = KERN_PRINTK_RATELIMIT_BURST, - .procname = "printk_ratelimit_burst", 
- .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = KERN_NGROUPS_MAX, - .procname = "ngroups_max", - .data = &ngroups_max, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .ctl_name = KERN_UNKNOWN_NMI_PANIC, - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_nmi_enabled, - }, -#endif -#if defined(CONFIG_X86) - { - .ctl_name = KERN_PANIC_ON_NMI, - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = KERN_BOOTLOADER_TYPE, - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) - { - .ctl_name = KERN_RANDOMIZE, - .procname = "randomize_va_space", - .data = &randomize_va_space, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .ctl_name = KERN_SPIN_RETRY, - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = 
"acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - }, -#endif -#ifdef CONFIG_IA64 - { - .ctl_name = KERN_IA64_UNALIGNED, - .procname = "ignore-unaligned-usertrap", - .data = &no_unaligned_warning, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_DETECT_SOFTLOCKUP - { - .ctl_name = CTL_UNNUMBERED, - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &neg_one, - .extra2 = &sixty, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, - }, -#endif -#ifdef CONFIG_COMPAT - { - .ctl_name = KERN_COMPAT_LOG, - .procname = "compat-log", - .data = &compat_log, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_RT_MUTEXES - { - .ctl_name = KERN_MAX_LOCK_DEPTH, - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = 
sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_PROC_FS - { - .ctl_name = CTL_UNNUMBERED, - .procname = "maps_protect", - .data = &maps_protect, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, -#ifdef CONFIG_KEYS - { - .ctl_name = CTL_UNNUMBERED, - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif -#ifdef CONFIG_RCU_TORTURE_TEST - { - .ctl_name = CTL_UNNUMBERED, - .procname = "rcutorture_runnable", - .data = &rcutorture_runnable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ - { .ctl_name = 0 } -}; - -static struct ctl_table vm_table[] = { - { - .ctl_name = VM_OVERCOMMIT_MEMORY, - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = VM_PANIC_ON_OOM, - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = VM_OVERCOMMIT_RATIO, - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = 
sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = VM_PAGE_CLUSTER, - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = VM_DIRTY_BACKGROUND, - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, - { - .ctl_name = VM_DIRTY_RATIO, - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = &dirty_ratio_handler, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = &dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = &proc_dointvec_userhz_jiffies, - }, - { - .ctl_name = VM_NR_PDFLUSH_THREADS, - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, -#ifdef CONFIG_HUGETLB_PAGE - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, - }, - { - .ctl_name = VM_HUGETLB_GROUP, - .procname = 
"hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &hugetlb_treat_movable_handler, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, - }, -#endif - { - .ctl_name = VM_LOWMEM_RESERVE_RATIO, - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = &lowmem_reserve_ratio_sysctl_handler, - .strategy = &sysctl_intvec, - }, - { - .ctl_name = VM_DROP_PAGECACHE, - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = drop_caches_sysctl_handler, - .strategy = &sysctl_intvec, - }, - { - .ctl_name = VM_MIN_FREE_KBYTES, - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = &min_free_kbytes_sysctl_handler, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, - { - .ctl_name = VM_PERCPU_PAGELIST_FRACTION, - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = &percpu_pagelist_fraction_sysctl_handler, - .strategy = &sysctl_intvec, - .extra1 = &min_percpu_pagelist_fract, - }, -#ifdef CONFIG_MMU - { - .ctl_name = VM_MAX_MAP_COUNT, - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = &proc_dointvec - }, -#endif - { - .ctl_name = VM_LAPTOP_MODE, - .procname = "laptop_mode", - .data 
= &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, - { - .ctl_name = VM_BLOCK_DUMP, - .procname = "block_dump", - .data = &block_dump, - .maxlen = sizeof(block_dump), - .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, - { - .ctl_name = VM_VFS_CACHE_PRESSURE, - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT - { - .ctl_name = VM_LEGACY_VA_LAYOUT, - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, -#endif -#ifdef CONFIG_NUMA - { - .ctl_name = VM_ZONE_RECLAIM_MODE, - .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), - .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, - { - .ctl_name = VM_MIN_UNMAPPED, - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, - { - .ctl_name = VM_MIN_SLAB, - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, -#endif -#ifdef CONFIG_SMP - { - .ctl_name = CTL_UNNUMBERED, - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - 
.proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - }, -#ifdef CONFIG_NUMA - { - .ctl_name = CTL_UNNUMBERED, - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = &numa_zonelist_order_handler, - .strategy = &sysctl_string, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .ctl_name = VM_VDSO_ENABLED, - .procname = "vdso_enabled", - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), - .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, -#endif -#ifdef CONFIG_HIGHMEM - { - .ctl_name = CTL_UNNUMBERED, - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one, - }, -#endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ - { .ctl_name = 0 } -}; - -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) -static struct ctl_table binfmt_misc_table[] = { - { .ctl_name = 0 } -}; -#endif - -static struct ctl_table fs_table[] = { - { - .ctl_name = FS_NRINODE, - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = FS_STATINODE, - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = 3*sizeof(int), - .mode = 0444, - .proc_handler = 
&proc_nr_files, - }, - { - .ctl_name = FS_MAXFILE, - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .ctl_name = FS_DENTRY, - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = FS_OVERFLOWUID, - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .ctl_name = FS_OVERFLOWGID, - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .ctl_name = FS_LEASES, - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_DNOTIFY - { - .ctl_name = FS_DIR_NOTIFY, - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU - { - .ctl_name = FS_LEASE_TIME, - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &two, - }, - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - 
.proc_handler = &proc_doulongvec_minmax, - }, -#ifdef CONFIG_INOTIFY_USER - { - .ctl_name = FS_INOTIFY, - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .ctl_name = KERN_SETUID_DUMPABLE, - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .ctl_name = CTL_UNNUMBERED, - .procname = "binfmt_misc", - .mode = 0555, - .child = binfmt_misc_table, - }, -#endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ - { .ctl_name = 0 } -}; - -static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) - { - .ctl_name = CTL_UNNUMBERED, - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { .ctl_name = 0 } -}; - -static struct ctl_table dev_table[] = { - { .ctl_name = 0 } -}; - -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - 
spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. - */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree(head); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = 
tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -#ifdef CONFIG_SYSCTL_SYSCALL -/* Perform the actual read/write of a sysctl table entry. */ -static int do_sysctl_strategy(struct ctl_table_root *root, - struct ctl_table *table, - int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int op = 0, rc; - - if (oldval) - op |= MAY_READ; - if (newval) - op |= MAY_WRITE; - if (sysctl_perm(root, table, op)) - return -EPERM; - - if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); - if (rc < 0) - return rc; - if (rc > 0) - return 0; - } - - /* If there is no strategy routine, or if the strategy returns - * zero, proceed with automatic r/w */ - if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); - if (rc < 0) - return rc; - } - return 0; -} - -static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - struct ctl_table_root *root, - struct ctl_table *table) -{ - int n; -repeat: - if (!nlen) - return -ENOTDIR; - if (get_user(n, name)) - return -EFAULT; - for ( ; table->ctl_name || table->procname; table++) { - if (!table->ctl_name) - continue; - if (n == 
table->ctl_name) { - int error; - if (table->child) { - if (sysctl_perm(root, table, MAY_EXEC)) - return -EPERM; - name++; - nlen--; - table = table->child; - goto repeat; - } - error = do_sysctl_strategy(root, table, name, nlen, - oldval, oldlenp, - newval, newlen); - return error; - } - } - return -ENOTDIR; -} - -int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table_header *head; - int error = -ENOTDIR; - - if (nlen <= 0 || nlen >= CTL_MAXNAME) - return -ENOTDIR; - if (oldval) { - int old_len; - if (!oldlenp || get_user(old_len, oldlenp)) - return -EFAULT; - } - - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { - error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, - head->root, head->ctl_table); - if (error != -ENOTDIR) { - sysctl_head_finish(head); - break; - } - } - return error; -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - if (error) - goto out; - - lock_kernel(); - error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, - tmp.newval, tmp.newlen); - unlock_kernel(); -out: - return error; -} -#endif /* CONFIG_SYSCTL_SYSCALL */ - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. 
- */ - -static int test_perm(int mode, int op) -{ - if (!current->euid) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int error; - int mode; - - error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); - if (error) - return error; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->ctl_name || table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - -static __init int sysctl_init(void) -{ - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - { - int err; - err = sysctl_check_table(current->nsproxy, root_table); - } -#endif - return 0; -} - -core_initcall(sysctl_init); - -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... 
and nothing else */ - if (branch[1].procname || branch[1].ctl_name) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname || p->ctl_name; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * ctl_name - This is the numeric sysctl value used by sysctl(2). The number - * must be unique within that level of sysctl - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. 
- * - * proc_handler - the text handler routine (described below) - * - * strategy - the strategy routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * More sophisticated management can be enabled by the provision of a - * strategy routine with the table entry. This will be called before - * any automatic read or write of the data is performed. - * - * The strategy routine may return - * - * < 0 - Error occurred (error is passed to user process) - * - * 0 - OK - proceed with automatic read or write. - * - * > 0 - OK - read or write has been done by the strategy routine, so - * return immediately. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. 
- */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (ctl_name == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->ctl_name = path->ctl_name; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - 
header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. 
- */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree(header->parent); - } - if (!--header->count) - kfree(header); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ - return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} - -#endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - -#ifdef CONFIG_PROC_SYSCTL - -static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - size_t len; - char __user *p; - char c; - - if (!data || !maxlen || !*lenp) { - *lenp = 0; - return 0; - } - - if (write) { - len = 0; - p = buffer; - while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; - if (c == 0 || c == '\n') - break; - len++; - } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; - ((char *) data)[len] = 0; - 
*ppos += *lenp; - } else { - len = strlen(data); - if (len > maxlen) - len = maxlen; - - if (*ppos > len) { - *lenp = 0; - return 0; - } - - data += *ppos; - len -= *ppos; - - if (len > *lenp) - len = *lenp; - if (len) - if(copy_to_user(buffer, data, len)) - return -EFAULT; - if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) - return -EFAULT; - len++; - } - *lenp = len; - *ppos += len; - } - return 0; -} - -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return _proc_do_string(table->data, table->maxlen, write, filp, - buffer, lenp, ppos); -} - - -static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - *valp = *negp ? 
-*lvalp : *lvalp; - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ -#define TMPBUFLEN 21 - int *i, vleft, first=1, neg, val; - unsigned long lval; - size_t left, len; - - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!tbl_data || !table->maxlen || !*lenp || - (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (int *) tbl_data; - vleft = table->maxlen / sizeof(*i); - left = *lenp; - - if (!conv) - conv = do_proc_dointvec_conv; - - for (; left && vleft--; i++, first=0) { - if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > sizeof(buf) - 1) - len = sizeof(buf) - 1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; - - lval = simple_strtoul(p, &p, 0); - - len = p-buf; - if ((len < left) && *p && !isspace(*p)) - break; - if (neg) - val = -val; - s += len; - left -= len; - - if (conv(&neg, &lval, i, 1, data)) - break; - } else { - p = buf; - if (!first) - *p++ = '\t'; - - if (conv(&neg, &lval, i, 0, data)) - break; - - sprintf(p, "%s%lu", neg ? 
"-" : "", lval); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; - } - } - - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } - if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } - } - if (write && first) - return -EINVAL; - *lenp -= left; - *ppos += *lenp; - return 0; -#undef TMPBUFLEN -} - -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - return __do_proc_dointvec(table->data, table, write, filp, - buffer, lenp, ppos, conv, data); -} - -/** - * proc_dointvec - read a vector of integers - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * Returns 0 on success. - */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - NULL,NULL); -} - -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? 
-*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -/* - * Taint values can only be increased - */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int op; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); -} - -struct do_proc_dointvec_minmax_conv_param { - int *min; - int *max; -}; - -static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - struct do_proc_dointvec_minmax_conv_param *param = data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) - return -EINVAL; - *valp = val; - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -/** - * proc_dointvec_minmax - read a vector of integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. 
- */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, - do_proc_dointvec_minmax_conv, ¶m); -} - -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ -#define TMPBUFLEN 21 - unsigned long *i, *min, *max, val; - int vleft, first=1, neg; - size_t len, left; - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!data || !table->maxlen || !*lenp || - (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; - vleft = table->maxlen / sizeof(unsigned long); - left = *lenp; - - for (; left && vleft--; i++, min++, max++, first=0) { - if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > TMPBUFLEN-1) - len = TMPBUFLEN-1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; - val = simple_strtoul(p, &p, 0) * convmul / convdiv ; - len = p-buf; - if ((len < left) && *p && !isspace(*p)) - break; - if (neg) - val = -val; - s += len; - left -= len; - - if(neg) - continue; - if ((min && val < *min) || (max && val > *max)) - continue; - *i = val; - } else { - p = buf; - if (!first) - *p++ = '\t'; - sprintf(p, "%lu", convdiv * (*i) / convmul); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; - } - } - - if (!write && 
!first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } - if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } - } - if (write && first) - return -EINVAL; - *lenp -= left; - *ppos += *lenp; - return 0; -#undef TMPBUFLEN -} - -static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); -} - -/** - * proc_doulongvec_minmax - read a vector of long integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); -} - -/** - * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. The values - * are treated as milliseconds, and converted to jiffies when they are stored. 
- * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, filp, buffer, - lenp, ppos, HZ, 1000l); -} - - -static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > LONG_MAX / HZ) - return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = -1; - lval = (unsigned long)-val; - } else { - *negp = 0; - lval = (unsigned long)val; - } - *lvalp = lval / HZ; - } - return 0; -} - -static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) - return 1; - *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = -1; - lval = (unsigned long)-val; - } else { - *negp = 0; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_clock_t(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - *valp = msecs_to_jiffies(*negp ? 
-*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = -1; - lval = (unsigned long)-val; - } else { - *negp = 0; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_msecs(lval); - } - return 0; -} - -/** - * proc_dointvec_jiffies - read a vector of integers as seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in seconds, and are converted into - * jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_jiffies_conv,NULL); -} - -/** - * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: pointer to the file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and - * are converted into jiffies. - * - * Returns 0 on success. 
- */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); -} - -/** - * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * @ppos: the current position in the file - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_conv, NULL); -} - -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; -} - -#else /* CONFIG_PROC_FS */ - -int proc_dostring(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t 
*lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - - -#endif /* CONFIG_PROC_FS */ - - -#ifdef CONFIG_SYSCTL_SYSCALL -/* - * General sysctl support routines - */ - -/* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - size_t len; - - /* Get out of I don't have a variable */ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, table->data, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - if (newval && newlen) { - if (newlen > table->maxlen) - newlen = table->maxlen; - - if (copy_from_user(table->data, newval, newlen)) - return -EFAULT; - } - return 1; -} - -/* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (!table->data || 
!table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - size_t bufsize; - if (get_user(bufsize, oldlenp)) - return -EFAULT; - if (bufsize) { - size_t len = strlen(table->data), copied; - - /* This shouldn't trigger for a well-formed sysctl */ - if (len > table->maxlen) - len = table->maxlen; - - /* Copy up to a max of bufsize-1 bytes of the string */ - copied = (len >= bufsize) ? bufsize - 1 : len; - - if (copy_to_user(oldval, table->data, copied) || - put_user(0, (char __user *)(oldval + copied))) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - size_t len = newlen; - if (len > table->maxlen) - len = table->maxlen; - if(copy_from_user(table->data, newval, len)) - return -EFAULT; - if (len == table->maxlen) - len--; - ((char *) table->data)[len] = 0; - } - return 1; -} - -/* - * This function makes sure that all of the integers in the vector - * are between the minimum and maximum values given in the arrays - * table->extra1 and table->extra2, respectively. 
- */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - - if (newval && newlen) { - int __user *vec = (int __user *) newval; - int *min = (int *) table->extra1; - int *max = (int *) table->extra2; - size_t length; - int i; - - if (newlen % sizeof(int) != 0) - return -EINVAL; - - if (!table->extra1 && !table->extra2) - return 0; - - if (newlen > table->maxlen) - newlen = table->maxlen; - length = newlen / sizeof(int); - - for (i = 0; i < length; i++) { - int value; - if (get_user(value, vec + i)) - return -EFAULT; - if (min && value < min[i]) - return -EINVAL; - if (max && value > max[i]) - return -EINVAL; - } - } - return 0; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = *(int *)(table->data) / HZ; - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = new*HZ; - } - return 1; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = jiffies_to_msecs(*(int *)(table->data)); - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if 
(put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = msecs_to_jiffies(new); - } - return 1; -} - - - -#else /* CONFIG_SYSCTL_SYSCALL */ - - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - - /* If no error reading the parameters then just -ENOSYS ... */ - if (!error) - error = -ENOSYS; - - return error; -} - -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -#endif /* CONFIG_SYSCTL_SYSCALL */ - -static int deprecated_sysctl_warning(struct __sysctl_args *args) -{ - static int msg_count; - int name[CTL_MAXNAME]; - int i; - - /* Check args->nlen. 
*/ - if (args->nlen < 0 || args->nlen > CTL_MAXNAME) - return -ENOTDIR; - - /* Read in the sysctl name for better debug message logging */ - for (i = 0; i < args->nlen; i++) - if (get_user(name[i], args->name + i)) - return -EFAULT; - - /* Ignore accesses to kernel.version */ - if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) - return 0; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < args->nlen; i++) - printk("%d.", name[i]); - printk("\n"); - } - return 0; -} - -/* - * No sense putting this after each symbol definition, twice, - * exception granted :-) - */ -EXPORT_SYMBOL(proc_dointvec); -EXPORT_SYMBOL(proc_dointvec_jiffies); -EXPORT_SYMBOL(proc_dointvec_minmax); -EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -EXPORT_SYMBOL(proc_dointvec_ms_jiffies); -EXPORT_SYMBOL(proc_dostring); -EXPORT_SYMBOL(proc_doulongvec_minmax); -EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(sysctl_intvec); -EXPORT_SYMBOL(sysctl_jiffies); -EXPORT_SYMBOL(sysctl_ms_jiffies); -EXPORT_SYMBOL(sysctl_string); -EXPORT_SYMBOL(sysctl_data); -EXPORT_SYMBOL(unregister_sysctl_table); -#include -#include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" -#include -#include -#include - -struct trans_ctl_table { - int ctl_name; - const char *procname; - const struct trans_ctl_table *child; -}; - -static const struct trans_ctl_table trans_random_table[] = { - { RANDOM_POOLSIZE, "poolsize" }, - { RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { RANDOM_BOOT_ID, "boot_id" }, - { RANDOM_UUID, "uuid" }, - {} -}; - -static const struct trans_ctl_table trans_pty_table[] = { - { PTY_MAX, "max" }, - { PTY_NR, "nr" }, - {} -}; - -static const struct trans_ctl_table 
trans_kern_table[] = { - { KERN_OSTYPE, "ostype" }, - { KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { KERN_NODENAME, "hostname" }, - { KERN_DOMAINNAME, "domainname" }, - - { KERN_PANIC, "panic" }, - { KERN_REALROOTDEV, "real-root-dev" }, - - { KERN_SPARC_REBOOT, "reboot-cmd" }, - { KERN_CTLALTDEL, "ctrl-alt-del" }, - { KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { KERN_MODPROBE, "modprobe" }, - { KERN_SG_BIG_BUFF, "sg-big-buff" }, - { KERN_ACCT, "acct" }, - { KERN_PPC_L2CR, "l2cr" }, - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { KERN_SHMMAX, "shmmax" }, - { KERN_MSGMAX, "msgmax" }, - { KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { KERN_SYSRQ, "sysrq" }, - { KERN_MAX_THREADS, "threads-max" }, - { KERN_RANDOM, "random", trans_random_table }, - { KERN_SHMALL, "shmall" }, - { KERN_MSGMNI, "msgmni" }, - { KERN_SEM, "sem" }, - { KERN_SPARC_STOP_A, "stop-a" }, - { KERN_SHMMNI, "shmmni" }, - - { KERN_OVERFLOWUID, "overflowuid" }, - { KERN_OVERFLOWGID, "overflowgid" }, - - { KERN_HOTPLUG, "hotplug", }, - { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { KERN_CORE_USES_PID, "core_uses_pid" }, - { KERN_TAINTED, "tainted" }, - { KERN_CADPID, "cad_pid" }, - { KERN_PIDMAX, "pid_max" }, - { KERN_CORE_PATTERN, "core_pattern" }, - { KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { KERN_HPPA_PWRSW, "soft-power" }, - { KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { KERN_PTY, "pty", trans_pty_table }, - { KERN_NGROUPS_MAX, "ngroups_max" }, - { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - { KERN_HZ_TIMER, "hz_timer" }, - { 
KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { KERN_RANDOMIZE, "randomize_va_space" }, - - { KERN_SPIN_RETRY, "spin_retry" }, - { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" }, - { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { KERN_COMPAT_LOG, "compat-log" }, - { KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { KERN_NMI_WATCHDOG, "nmi_watchdog" }, - { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - {} -}; - -static const struct trans_ctl_table trans_vm_table[] = { - { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { VM_PAGE_CLUSTER, "page-cluster" }, - { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { VM_DIRTY_RATIO, "dirty_ratio" }, - { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" }, - { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" }, - { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, - { VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - { VM_HUGETLB_PAGES, "nr_hugepages" }, - { VM_SWAPPINESS, "swappiness" }, - { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { VM_MAX_MAP_COUNT, "max_map_count" }, - { VM_LAPTOP_MODE, "laptop_mode" }, - { VM_BLOCK_DUMP, "block_dump" }, - { VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { VM_DROP_PAGECACHE, "drop_caches" }, - { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { VM_PANIC_ON_OOM, "panic_on_oom" }, - { VM_VDSO_ENABLED, "vdso_enabled" }, - { VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct trans_ctl_table trans_net_core_table[] = { - { NET_CORE_WMEM_MAX, "wmem_max" }, - { NET_CORE_RMEM_MAX, "rmem_max" }, - { NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { 
NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { NET_CORE_MSG_COST, "message_cost" }, - { NET_CORE_MSG_BURST, "message_burst" }, - { NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { NET_CORE_DEV_WEIGHT, "dev_weight" }, - { NET_CORE_SOMAXCONN, "somaxconn" }, - { NET_CORE_BUDGET, "netdev_budget" }, - { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct trans_ctl_table trans_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_route_table[] = { - { NET_IPV4_ROUTE_FLUSH, "flush" }, - { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, - { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, - { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { - { 
NET_IPV4_CONF_FORWARDING, "forwarding" }, - { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { NET_IPV4_CONF_TAG, "tag" }, - { NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - - { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, - { 0, NULL, trans_net_ipv4_conf_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_vars_table[] = { - { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { NET_NEIGH_APP_SOLICIT, "app_solicit" }, - { NET_NEIGH_RETRANS_TIME, "retrans_time" }, - { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" }, - { NET_NEIGH_PROXY_DELAY, "proxy_delay" }, - { NET_NEIGH_LOCKTIME, "locktime" }, - { NET_NEIGH_GC_INTERVAL, "gc_interval" 
}, - { NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_table[] = { - { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, - { 0, NULL, trans_net_neigh_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { - { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" }, - - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" }, - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" }, - { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" }, - { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" }, - - { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" }, - { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, 
"ip_conntrack_tcp_max_retrans" }, - - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" }, - - { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_table[] = { - { NET_IPV4_FORWARD, "ip_forward" }, - { NET_IPV4_DYNADDR, "ip_dynaddr" }, - - { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table }, - { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, - - { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { NET_IPV4_TCP_SACK, "tcp_sack" }, - { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { NET_IPV4_TCP_RETRIES1, 
"tcp_retries1" }, - { NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - /* NET_IPV4_IP_MASQ_DEBUG unused */ - { NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { NET_TCP_STDURG, "tcp_stdurg" }, - { NET_TCP_RFC1337, "tcp_rfc1337" }, - /* NET_TCP_SYN_TAILDROP unused */ - { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - /* NET_IPV4_ALWAYS_DEFRAG unused */ - { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { NET_TCP_FACK, "tcp_fack" }, - { NET_TCP_REORDERING, "tcp_reordering" }, - { NET_TCP_ECN, "tcp_ecn" }, - { NET_TCP_DSACK, "tcp_dsack" }, - { NET_TCP_MEM, "tcp_mem" }, - { NET_TCP_WMEM, "tcp_wmem" }, - { NET_TCP_RMEM, "tcp_rmem" }, - { NET_TCP_APP_WIN, "tcp_app_win" }, - { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { 
NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - { NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { NET_TCP_FRTO, "tcp_frto" }, - { NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - /* NET_TCP_BIC_BETA unused */ - { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { NET_TCP_ABC, "tcp_abc" }, - { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" }, - { NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { NET_TCP_BASE_MSS, "tcp_base_mss" }, - { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, - { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" }, - { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipx_table[] = { - { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct trans_ctl_table trans_net_atalk_table[] = { - { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { 
NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct trans_ctl_table trans_net_netrom_table[] = { - { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_param_table[] = { - { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { NET_AX25_CONNECT_MODE, "connect_mode" }, - { NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { NET_AX25_N2, "maximum_retry_count" }, - { NET_AX25_PACLEN, "maximum_packet_length" }, - { NET_AX25_PROTOCOL, "protocol" }, - { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_table[] = { - { 0, NULL, trans_net_ax25_param_table }, - {} -}; - -static const struct trans_ctl_table trans_net_bridge_table[] = { - { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, - { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, - { 
NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, - { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" }, - { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" }, - {} -}; - -static const struct trans_ctl_table trans_net_rose_table[] = { - { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { NET_ROSE_WINDOW_SIZE, "window_size" }, - { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { - { NET_IPV6_FORWARDING, "forwarding" }, - { NET_IPV6_HOP_LIMIT, "hop_limit" }, - { NET_IPV6_MTU, "mtu" }, - { NET_IPV6_ACCEPT_RA, "accept_ra" }, - { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV6_AUTOCONF, "autoconf" }, - { NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { 
NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, - { 0, NULL, trans_net_ipv6_conf_var_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_route_table[] = { - { NET_IPV6_ROUTE_FLUSH, "flush" }, - { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = { - { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_table[] = { - { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, - { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, - { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table }, - { NET_IPV6_BINDV6ONLY, "bindv6only" }, - { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_x25_table[] = { - { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { 
NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct trans_ctl_table trans_net_tr_table[] = { - { NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct trans_ctl_table trans_net_decnet_conf_vars[] = { - { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { NET_DECNET_CONF_DEV_T2, "t2" }, - { NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_conf[] = { - { 0, NULL, trans_net_decnet_conf_vars }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_table[] = { - { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, - { NET_DECNET_NODE_ADDRESS, "node_address" }, - { NET_DECNET_NODE_NAME, "node_name" }, - { NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { NET_DECNET_TIME_WAIT, "time_wait" }, - { NET_DECNET_DN_COUNT, "dn_count" }, - { NET_DECNET_DI_COUNT, "di_count" }, - { NET_DECNET_DR_COUNT, "dr_count" }, - { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { NET_DECNET_MEM, "decnet_mem" }, - { NET_DECNET_RMEM, "decnet_rmem" }, - { NET_DECNET_WMEM, "decnet_wmem" }, - { NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct trans_ctl_table trans_net_sctp_table[] = { - { NET_SCTP_RTO_INITIAL, "rto_initial" }, - { NET_SCTP_RTO_MIN, "rto_min" }, - { NET_SCTP_RTO_MAX, "rto_max" }, - { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { NET_SCTP_HB_INTERVAL, "hb_interval" }, - { NET_SCTP_PRESERVE_ENABLE, 
"cookie_preserve_enable" }, - { NET_SCTP_MAX_BURST, "max_burst" }, - { NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { - { NET_LLC2_ACK_TIMEOUT, "ack" }, - { NET_LLC2_P_TIMEOUT, "p" }, - { NET_LLC2_REJ_TIMEOUT, "rej" }, - { NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_station_table[] = { - { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_table[] = { - { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_table[] = { - { NET_LLC2, "llc2", trans_net_llc_llc2_table }, - { NET_LLC_STATION, "station", trans_net_llc_station_table }, - {} -}; - -static const struct trans_ctl_table trans_net_netfilter_table[] = { - { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" }, - { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" }, - { 
NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" }, - { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" }, - { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" }, - { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct trans_ctl_table trans_net_dccp_table[] = { - { NET_DCCP_DEFAULT, "default" }, - {} -}; - -static const struct trans_ctl_table trans_net_irda_table[] = { - { NET_IRDA_DISCOVERY, "discovery" }, - { NET_IRDA_DEVNAME, "devname" }, - { NET_IRDA_DEBUG, "debug" }, - { NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { NET_IRDA_SLOT_TIMEOUT, 
"slot_timeout" }, - { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - -static const struct trans_ctl_table trans_net_table[] = { - { NET_CORE, "core", trans_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { NET_UNIX, "unix", trans_net_unix_table }, - { NET_IPV4, "ipv4", trans_net_ipv4_table }, - { NET_IPX, "ipx", trans_net_ipx_table }, - { NET_ATALK, "appletalk", trans_net_atalk_table }, - { NET_NETROM, "netrom", trans_net_netrom_table }, - { NET_AX25, "ax25", trans_net_ax25_table }, - { NET_BRIDGE, "bridge", trans_net_bridge_table }, - { NET_ROSE, "rose", trans_net_rose_table }, - { NET_IPV6, "ipv6", trans_net_ipv6_table }, - { NET_X25, "x25", trans_net_x25_table }, - { NET_TR, "token-ring", trans_net_tr_table }, - { NET_DECNET, "decnet", trans_net_decnet_table }, - /* NET_ECONET not used */ - { NET_SCTP, "sctp", trans_net_sctp_table }, - { NET_LLC, "llc", trans_net_llc_table }, - { NET_NETFILTER, "netfilter", trans_net_netfilter_table }, - { NET_DCCP, "dccp", trans_net_dccp_table }, - { NET_IRDA, "irda", trans_net_irda_table }, - { 2089, "nf_conntrack_max" }, - {} -}; - -static const struct trans_ctl_table trans_fs_quota_table[] = { - { FS_DQ_LOOKUPS, "lookups" }, - { FS_DQ_DROPS, "drops" }, - { FS_DQ_READS, "reads" }, - { FS_DQ_WRITES, "writes" }, - { FS_DQ_CACHE_HITS, "cache_hits" }, - { FS_DQ_ALLOCATED, "allocated_dquots" }, - { FS_DQ_FREE, "free_dquots" }, - { FS_DQ_SYNCS, "syncs" }, - { FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct trans_ctl_table trans_fs_xfs_table[] = { - { XFS_RESTRICT_CHOWN, "restrict_chown" }, - { XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { XFS_SYMLINK_MODE, "irix_symlink_mode" 
}, - { XFS_PANIC_MASK, "panic_mask" }, - - { XFS_ERRLEVEL, "error_level" }, - { XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, - { XFS_INHERIT_SYNC, "inherit_sync" }, - { XFS_INHERIT_NODUMP, "inherit_nodump" }, - { XFS_INHERIT_NOATIME, "inherit_noatime" }, - { XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { XFS_BUF_AGE, "age_buffer_centisecs" }, - { XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { XFS_ROTORSTEP, "rotorstep" }, - { XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { - { 1, "hb_ctl_path" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_table[] = { - { 1, "nm", trans_fs_ocfs2_nm_table }, - {} -}; - -static const struct trans_ctl_table trans_inotify_table[] = { - { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct trans_ctl_table trans_fs_table[] = { - { FS_NRINODE, "inode-nr" }, - { FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - { FS_NRFILE, "file-nr" }, - { FS_MAXFILE, "file-max" }, - { FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { FS_OVERFLOWUID, "overflowuid" }, - { FS_OVERFLOWGID, "overflowgid" }, - { FS_LEASES, "leases-enable" }, - { FS_DIR_NOTIFY, "dir-notify-enable" }, - { FS_LEASE_TIME, "lease-break-time" }, - { FS_DQSTATS, "quota", trans_fs_quota_table }, - { FS_XFS, "xfs", trans_fs_xfs_table }, - { FS_AIO_NR, "aio-nr" }, - { FS_AIO_MAX_NR, "aio-max-nr" }, - { FS_INOTIFY, "inotify", trans_inotify_table }, - { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table }, - { KERN_SETUID_DUMPABLE, "suid_dumpable" }, - {} -}; - -static const struct trans_ctl_table trans_debug_table[] = { - {} -}; - -static const struct trans_ctl_table trans_cdrom_table[] = { - { 
DEV_CDROM_INFO, "info" }, - { DEV_CDROM_AUTOCLOSE, "autoclose" }, - { DEV_CDROM_AUTOEJECT, "autoeject" }, - { DEV_CDROM_DEBUG, "debug" }, - { DEV_CDROM_LOCK, "lock" }, - { DEV_CDROM_CHECK_MEDIA, "check_media" }, - {} -}; - -static const struct trans_ctl_table trans_ipmi_table[] = { - { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct trans_ctl_table trans_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct trans_ctl_table trans_raid_table[] = { - { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct trans_ctl_table trans_scsi_table[] = { - { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct trans_ctl_table trans_parport_default_table[] = { - { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, - { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, - {} -}; - -static const struct trans_ctl_table trans_parport_device_table[] = { - { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, - {} -}; - -static const struct trans_ctl_table trans_parport_devices_table[] = { - { DEV_PARPORT_DEVICES_ACTIVE, "active" }, - { 0, NULL, trans_parport_device_table }, - {} -}; - -static const struct trans_ctl_table trans_parport_parport_table[] = { - { DEV_PARPORT_SPINTIME, "spintime" }, - { DEV_PARPORT_BASE_ADDR, "base-addr" }, - { DEV_PARPORT_IRQ, "irq" }, - { DEV_PARPORT_DMA, "dma" }, - { DEV_PARPORT_MODES, "modes" }, - { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table }, - { DEV_PARPORT_AUTOPROBE, "autoprobe" }, - { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" }, - { DEV_PARPORT_AUTOPROBE + 2, 
"autoprobe1" }, - { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" }, - { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, - {} -}; -static const struct trans_ctl_table trans_parport_table[] = { - { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, - { 0, NULL, trans_parport_parport_table }, - {} -}; - -static const struct trans_ctl_table trans_dev_table[] = { - { DEV_CDROM, "cdrom", trans_cdrom_table }, - /* DEV_HWMON unused */ - { DEV_PARPORT, "parport", trans_parport_table }, - { DEV_RAID, "raid", trans_raid_table }, - { DEV_MAC_HID, "mac_hid", trans_mac_hid_files }, - { DEV_SCSI, "scsi", trans_scsi_table }, - { DEV_IPMI, "ipmi", trans_ipmi_table }, - {} -}; - -static const struct trans_ctl_table trans_bus_isa_table[] = { - { BUS_ISA_MEM_BASE, "membase" }, - { BUS_ISA_PORT_BASE, "portbase" }, - { BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct trans_ctl_table trans_bus_table[] = { - { CTL_BUS_ISA, "isa", trans_bus_isa_table }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table0[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, 
"fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan0-txRing" }, - { 151, "arlan0-rxRing" }, - { 152, "arlan0-18" }, - { 153, "arlan0-ring" }, - { 154, "arlan0-shm-cpy" }, - { 155, "config0" }, - { 156, "reset0" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table1[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, 
"entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan1-txRing" }, - { 151, "arlan1-rxRing" }, - { 152, "arlan1-18" }, - { 153, "arlan1-ring" }, - { 154, "arlan1-shm-cpy" }, - { 155, "config1" }, - { 156, "reset1" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table2[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, 
"pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan2-txRing" }, - { 151, "arlan2-rxRing" }, - { 152, "arlan2-18" }, - { 153, "arlan2-ring" }, - { 154, "arlan2-shm-cpy" }, - { 155, "config2" }, - { 156, "reset2" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table3[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan3-txRing" }, - { 151, "arlan3-rxRing" }, - { 152, "arlan3-18" }, - { 153, "arlan3-ring" }, - { 154, "arlan3-shm-cpy" }, - { 155, 
"config3" }, - { 156, "reset3" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_table[] = { - { 1, "arlan0", trans_arlan_conf_table0 }, - { 2, "arlan1", trans_arlan_conf_table1 }, - { 3, "arlan2", trans_arlan_conf_table2 }, - { 4, "arlan3", trans_arlan_conf_table3 }, - {} -}; - -static const struct trans_ctl_table trans_s390dbf_table[] = { - { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct trans_ctl_table trans_sunrpc_table[] = { - { CTL_RPCDEBUG, "rpc_debug" }, - { CTL_NFSDEBUG, "nfs_debug" }, - { CTL_NFSDDEBUG, "nfsd_debug" }, - { CTL_NLMDEBUG, "nlm_debug" }, - { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct trans_ctl_table trans_pm_table[] = { - { 1 /* CTL_PM_SUSPEND */, "suspend" }, - { 2 /* CTL_PM_CMODE */, "cmode" }, - { 3 /* CTL_PM_P0 */, "p0" }, - { 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct trans_ctl_table trans_frv_table[] = { - { 1, "cache-mode" }, - { 2, "pin-cxnr" }, - {} -}; - -static const struct trans_ctl_table trans_root_table[] = { - { CTL_KERN, "kernel", trans_kern_table }, - { CTL_VM, "vm", trans_vm_table }, - { CTL_NET, "net", trans_net_table }, - /* CTL_PROC not used */ - { CTL_FS, "fs", trans_fs_table }, - { CTL_DEBUG, "debug", trans_debug_table }, - { CTL_DEV, "dev", trans_dev_table }, - { CTL_BUS, "bus", trans_bus_table }, - { CTL_ABI, "abi" }, - /* CTL_CPU not used */ - { CTL_ARLAN, "arlan", trans_arlan_table }, - { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, - { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, - { CTL_PM, "pm", trans_pm_table }, - { CTL_FRV, "frv", trans_frv_table }, - {} -}; - - - - -static int sysctl_depth(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - 
depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - -static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) -{ - struct ctl_table *test; - const struct trans_ctl_table *ref; - int cur_depth; - - cur_depth = sysctl_depth(table); - - ref = trans_root_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname || ref->child; ref++) { - int match = 0; - - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - - if (!ref->ctl_name && !ref->procname) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - ref = NULL; -out: - return ref; -} - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); - if (table->ctl_name) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk(".%d", tmp->ctl_name); - } - } -} - -static void sysctl_repair_table(struct ctl_table *table) -{ - /* Don't complain about the classic default - * sysctl strategy routine. Maybe later we - * can get the tables fixed and complain about - * this. 
- */ - if (table->ctl_name && table->procname && - (table->proc_handler == proc_dointvec) && - (!table->strategy)) { - table->strategy = sysctl_data; - } -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char **fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static int sysctl_check_dir(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table *ref; - int error; - - error = 0; - ref = sysctl_check_lookup(namespaces, table); - if (ref) { - int match = 0; - if ((!table->procname && !ref->procname) || - (table->procname && ref->procname && - (strcmp(table->procname, ref->procname) == 0))) - match++; - - if ((!table->ctl_name && !ref->ctl_name) || - (table->ctl_name && ref->ctl_name && - (table->ctl_name == ref->ctl_name))) - match++; - - if (match != 2) { - printk(KERN_ERR "%s: failed: ", __func__); - sysctl_print_path(table); - printk(" ref: "); - sysctl_print_path(ref); - printk("\n"); - error = 
-EINVAL; - } - } - return error; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) -{ - const struct trans_ctl_table *ref; - - ref = sysctl_binary_lookup(table); - if (table->ctl_name && !ref) - set_fail(fail, table, "Unknown sysctl binary path"); - if (ref) { - if (ref->procname && - (!table->procname || - (strcmp(table->procname, ref->procname) != 0))) - set_fail(fail, table, "procname does not match binary path procname"); - - if (ref->ctl_name && table->ctl_name && - (table->ctl_name != ref->ctl_name)) - set_fail(fail, table, "ctl_name does not match binary path ctl_name"); - } -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ - int error = 0; - for (; table->ctl_name || table->procname; table++) { - const char *fail = NULL; - - sysctl_repair_table(table); - if (table->parent) { - if (table->procname && !table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - if (table->ctl_name && !table->parent->ctl_name) - set_fail(&fail, table, "Parent without ctl_name"); - } - if (!table->procname) - set_fail(&fail, table, "No procname"); - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->strategy) - set_fail(&fail, table, "Directory with strategy"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - if 
(sysctl_check_dir(namespaces, table)) - set_fail(&fail, table, "Inconsistent directory names"); - } else { - if ((table->strategy == sysctl_data) || - (table->strategy == sysctl_string) || - (table->strategy == sysctl_intvec) || - (table->strategy == sysctl_jiffies) || - (table->strategy == sysctl_ms_jiffies) || - (table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } - if ((table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (table->maxlen > sizeof (unsigned long)) { - if (!table->extra1) - set_fail(&fail, table, "No min"); - if (!table->extra2) - set_fail(&fail, table, "No max"); - } - } -#ifdef CONFIG_SYSCTL_SYSCALL - if (table->ctl_name && !table->strategy) - set_fail(&fail, table, "Missing strategy"); -#endif -#if 0 - if (!table->ctl_name && table->strategy) - set_fail(&fail, table, "Strategy without ctl_name"); -#endif -#ifdef CONFIG_PROC_FS - if (table->procname && !table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - sysctl_check_bin_path(table, &fail); - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} -/* - * taskstats.c - Export per-task 
statistics to userland - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2006 - * (C) Balbir Singh, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Maximum length of a cpumask that can be specified in - * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute - */ -#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) - -static DEFINE_PER_CPU(__u32, taskstats_seqnum); -static int family_registered; -struct kmem_cache *taskstats_cache; - -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = TASKSTATS_GENL_NAME, - .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, -}; - -static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] -__read_mostly = { - [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, - [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, - [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, - [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; - -static struct nla_policy -cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { - [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, -}; - -struct listener { - struct list_head list; - pid_t pid; - char valid; -}; - -struct listener_list { - struct rw_semaphore sem; - struct list_head list; -}; -static DEFINE_PER_CPU(struct listener_list, listener_array); - -enum actions { - REGISTER, - DEREGISTER, - 
CPU_DONT_CARE -}; - -static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, - size_t size) -{ - struct sk_buff *skb; - void *reply; - - /* - * If new attributes are added, please revisit this allocation - */ - skb = genlmsg_new(size, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - if (!info) { - int seq = get_cpu_var(taskstats_seqnum)++; - put_cpu_var(taskstats_seqnum); - - reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); - } else - reply = genlmsg_put_reply(skb, info, &family, 0, cmd); - if (reply == NULL) { - nlmsg_free(skb); - return -EINVAL; - } - - *skbp = skb; - return 0; -} - -/* - * Send taskstats data in @skb to listener with nl_pid @pid - */ -static int send_reply(struct sk_buff *skb, pid_t pid) -{ - struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); - void *reply = genlmsg_data(genlhdr); - int rc; - - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return rc; - } - - return genlmsg_unicast(skb, pid); -} - -/* - * Send taskstats data in @skb to listeners registered for @cpu's exit data - */ -static void send_cpu_listeners(struct sk_buff *skb, - struct listener_list *listeners) -{ - struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); - struct listener *s, *tmp; - struct sk_buff *skb_next, *skb_cur = skb; - void *reply = genlmsg_data(genlhdr); - int rc, delcount = 0; - - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return; - } - - rc = 0; - down_read(&listeners->sem); - list_for_each_entry(s, &listeners->list, list) { - skb_next = NULL; - if (!list_is_last(&s->list, &listeners->list)) { - skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) - break; - } - rc = genlmsg_unicast(skb_cur, s->pid); - if (rc == -ECONNREFUSED) { - s->valid = 0; - delcount++; - } - skb_cur = skb_next; - } - up_read(&listeners->sem); - - if (skb_cur) - nlmsg_free(skb_cur); - - if (!delcount) - return; - - /* Delete invalidated entries */ - down_write(&listeners->sem); - list_for_each_entry_safe(s, 
tmp, &listeners->list, list) { - if (!s->valid) { - list_del(&s->list); - kfree(s); - } - } - up_write(&listeners->sem); -} - -static int fill_pid(pid_t pid, struct task_struct *tsk, - struct taskstats *stats) -{ - int rc = 0; - - if (!tsk) { - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return -ESRCH; - } else - get_task_struct(tsk); - - memset(stats, 0, sizeof(*stats)); - /* - * Each accounting subsystem adds calls to its functions to - * fill in relevant parts of struct taskstsats as follows - * - * per-task-foo(stats, tsk); - */ - - delayacct_add_tsk(stats, tsk); - - /* fill in basic acct fields */ - stats->version = TASKSTATS_VERSION; - stats->nvcsw = tsk->nvcsw; - stats->nivcsw = tsk->nivcsw; - bacct_add_tsk(stats, tsk); - - /* fill in extended acct fields */ - xacct_add_tsk(stats, tsk); - - /* Define err: label here if needed */ - put_task_struct(tsk); - return rc; - -} - -static int fill_tgid(pid_t tgid, struct task_struct *first, - struct taskstats *stats) -{ - struct task_struct *tsk; - unsigned long flags; - int rc = -ESRCH; - - /* - * Add additional stats from live tasks except zombie thread group - * leaders who are already counted with the dead tasks - */ - rcu_read_lock(); - if (!first) - first = find_task_by_vpid(tgid); - - if (!first || !lock_task_sighand(first, &flags)) - goto out; - - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - else - memset(stats, 0, sizeof(*stats)); - - tsk = first; - do { - if (tsk->exit_state) - continue; - /* - * Accounting subsystem can call its functions here to - * fill in relevant parts of struct taskstsats as follows - * - * per-task-foo(stats, tsk); - */ - delayacct_add_tsk(stats, tsk); - - stats->nvcsw += tsk->nvcsw; - stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); - - unlock_task_sighand(first, &flags); - rc = 0; -out: - rcu_read_unlock(); - - stats->version = TASKSTATS_VERSION; - /* - * 
Accounting subsystems can also add calls here to modify - * fields of taskstats. - */ - return rc; -} - - -static void fill_tgid_exit(struct task_struct *tsk) -{ - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - if (!tsk->signal->stats) - goto ret; - - /* - * Each accounting subsystem calls its functions here to - * accumalate its per-task stats for tsk, into the per-tgid structure - * - * per-task-foo(tsk->signal->stats, tsk); - */ - delayacct_add_tsk(tsk->signal->stats, tsk); -ret: - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); - return; -} - -static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) -{ - struct listener_list *listeners; - struct listener *s, *tmp; - unsigned int cpu; - cpumask_t mask = *maskp; - - if (!cpus_subset(mask, cpu_possible_map)) - return -EINVAL; - - if (isadd == REGISTER) { - for_each_cpu_mask_nr(cpu, mask) { - s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, - cpu_to_node(cpu)); - if (!s) - goto cleanup; - s->pid = pid; - INIT_LIST_HEAD(&s->list); - s->valid = 1; - - listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); - list_add(&s->list, &listeners->list); - up_write(&listeners->sem); - } - return 0; - } - - /* Deregister or cleanup */ -cleanup: - for_each_cpu_mask_nr(cpu, mask) { - listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { - if (s->pid == pid) { - list_del(&s->list); - kfree(s); - break; - } - } - up_write(&listeners->sem); - } - return 0; -} - -static int parse(struct nlattr *na, cpumask_t *mask) -{ - char *data; - int len; - int ret; - - if (na == NULL) - return 1; - len = nla_len(na); - if (len > TASKSTATS_CPUMASK_MAXLEN) - return -E2BIG; - if (len < 1) - return -EINVAL; - data = kmalloc(len, GFP_KERNEL); - if (!data) - return -ENOMEM; - nla_strlcpy(data, na, len); - ret = cpulist_parse(data, *mask); - kfree(data); - return ret; -} - -static struct taskstats 
*mk_reply(struct sk_buff *skb, int type, u32 pid) -{ - struct nlattr *na, *ret; - int aggr; - - aggr = (type == TASKSTATS_TYPE_PID) - ? TASKSTATS_TYPE_AGGR_PID - : TASKSTATS_TYPE_AGGR_TGID; - - na = nla_nest_start(skb, aggr); - if (!na) - goto err; - if (nla_put(skb, type, sizeof(pid), &pid) < 0) - goto err; - ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); - if (!ret) - goto err; - nla_nest_end(skb, na); - - return nla_data(ret); -err: - return NULL; -} - -static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) -{ - int rc = 0; - struct sk_buff *rep_skb; - struct cgroupstats *stats; - struct nlattr *na; - size_t size; - u32 fd; - struct file *file; - int fput_needed; - - na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; - if (!na) - return -EINVAL; - - fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - file = fget_light(fd, &fput_needed); - if (!file) - return 0; - - size = nla_total_size(sizeof(struct cgroupstats)); - - rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, - size); - if (rc < 0) - goto err; - - na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, - sizeof(struct cgroupstats)); - stats = nla_data(na); - memset(stats, 0, sizeof(*stats)); - - rc = cgroupstats_build(stats, file->f_dentry); - if (rc < 0) { - nlmsg_free(rep_skb); - goto err; - } - - rc = send_reply(rep_skb, info->snd_pid); - -err: - fput_light(file, fput_needed); - return rc; -} - -static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) -{ - int rc = 0; - struct sk_buff *rep_skb; - struct taskstats *stats; - size_t size; - cpumask_t mask; - - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); - if (rc < 0) - return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, REGISTER); - - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); - if (rc < 0) - return rc; - if (rc == 0) - return add_del_listener(info->snd_pid, &mask, DEREGISTER); - - /* - * Size includes space 
for nested attributes - */ - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - - rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); - if (rc < 0) - return rc; - - rc = -EINVAL; - if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { - u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); - if (!stats) - goto err; - - rc = fill_pid(pid, NULL, stats); - if (rc < 0) - goto err; - } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { - u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); - if (!stats) - goto err; - - rc = fill_tgid(tgid, NULL, stats); - if (rc < 0) - goto err; - } else - goto err; - - return send_reply(rep_skb, info->snd_pid); -err: - nlmsg_free(rep_skb); - return rc; -} - -static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct taskstats *stats; - - if (sig->stats || thread_group_empty(tsk)) - goto ret; - - /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); - - spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; - } - spin_unlock_irq(&tsk->sighand->siglock); - - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; -} - -/* Send pid data out on exit */ -void taskstats_exit(struct task_struct *tsk, int group_dead) -{ - int rc; - struct listener_list *listeners; - struct taskstats *stats; - struct sk_buff *rep_skb; - size_t size; - int is_thread_group; - - if (!family_registered) - return; - - /* - * Size includes space for nested attributes - */ - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - - is_thread_group = !!taskstats_tgid_alloc(tsk); - if (is_thread_group) { - /* PID + STATS + TGID + STATS */ - size = 2 * size; - /* fill the 
tsk->signal->stats structure */ - fill_tgid_exit(tsk); - } - - listeners = &__raw_get_cpu_var(listener_array); - if (list_empty(&listeners->list)) - return; - - rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); - if (rc < 0) - return; - - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); - if (!stats) - goto err; - - rc = fill_pid(-1, tsk, stats); - if (rc < 0) - goto err; - - /* - * Doesn't matter if tsk is the leader or the last group member leaving - */ - if (!is_thread_group || !group_dead) - goto send; - - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); - if (!stats) - goto err; - - memcpy(stats, tsk->signal->stats, sizeof(*stats)); - -send: - send_cpu_listeners(rep_skb, listeners); - return; -err: - nlmsg_free(rep_skb); -} - -static struct genl_ops taskstats_ops = { - .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, -}; - -static struct genl_ops cgroupstats_ops = { - .cmd = CGROUPSTATS_CMD_GET, - .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, -}; - -/* Needed early in initialization */ -void __init taskstats_init_early(void) -{ - unsigned int i; - - taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); - for_each_possible_cpu(i) { - INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); - init_rwsem(&(per_cpu(listener_array, i).sem)); - } -} - -static int __init taskstats_init(void) -{ - int rc; - - rc = genl_register_family(&family); - if (rc) - return rc; - - rc = genl_register_ops(&family, &taskstats_ops); - if (rc < 0) - goto err; - - rc = genl_register_ops(&family, &cgroupstats_ops); - if (rc < 0) - goto err_cgroup_ops; - - family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); - return 0; -err_cgroup_ops: - genl_unregister_ops(&family, &taskstats_ops); -err: - genl_unregister_family(&family); - return rc; -} - -/* - * late initcall ensures initialization of statistics collection - * mechanisms 
precedes initialization of the taskstats interface - */ -late_initcall(taskstats_init); -/* - * test_kprobes.c - simple sanity test for *probes - * - * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. - */ - -#include -#include -#include - -#define div_factor 3 - -static u32 rand1, preh_val, posth_val, jph_val; -static int errors, handler_errors, num_tests; - -static noinline u32 kprobe_target(u32 value) -{ - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. - * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. 
This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - - return (value / div_factor); -} - -static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - preh_val = (rand1 / div_factor); - return 0; -} - -static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - if (preh_val != (rand1 / div_factor)) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); - } - posth_val = preh_val + div_factor; -} - -static struct kprobe kp = { - .symbol_name = "kprobe_target", - .pre_handler = kp_pre_handler, - .post_handler = kp_post_handler -}; - -static int test_kprobe(void) -{ - int ret; - - ret = register_kprobe(&kp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); - return ret; - } - - ret = kprobe_target(rand1); - unregister_kprobe(&kp); - - if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); - handler_errors++; - } - - return 0; -} - -static u32 j_kprobe_target(u32 value) -{ - if (value != rand1) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); - } - - jph_val = rand1; - jprobe_return(); - return 0; -} - -static struct jprobe jp = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target" -}; - -static int test_jprobe(void) -{ - int ret; - - ret = register_jprobe(&jp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); - return ret; - } - - ret = kprobe_target(rand1); - unregister_jprobe(&jp); - if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - 
-#ifdef CONFIG_KRETPROBES -static u32 krph_val; - -static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - krph_val = (rand1 / div_factor); - return 0; -} - -static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long ret = regs_return_value(regs); - - if (ret != (rand1 / div_factor)) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); - } - if (krph_val == 0) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); - } - - krph_val = rand1; - return 0; -} - -static struct kretprobe rp = { - .handler = return_handler, - .entry_handler = entry_handler, - .kp.symbol_name = "kprobe_target" -}; - -static int test_kretprobe(void) -{ - int ret; - - ret = register_kretprobe(&rp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); - return ret; - } - - ret = kprobe_target(rand1); - unregister_kretprobe(&rp); - if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); - handler_errors++; - } - - return 0; -} -#endif /* CONFIG_KRETPROBES */ - -int init_test_probes(void) -{ - int ret; - - do { - rand1 = random32(); - } while (rand1 <= div_factor); - - printk(KERN_INFO "Kprobe smoke test started\n"); - num_tests++; - ret = test_kprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobe(); - if (ret < 0) - errors++; - -#ifdef CONFIG_KRETPROBES - num_tests++; - ret = test_kretprobe(); - if (ret < 0) - errors++; -#endif /* CONFIG_KRETPROBES */ - - if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); - else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); - else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); - - return 0; -} -/* - * 
linux/kernel/time/clockevents.c - * - * This file contains functions which manage clock event devices. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* The registered clock event devices */ -static LIST_HEAD(clockevent_devices); -static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - -/* Protection for the above */ -static DEFINE_SPINLOCK(clockevents_lock); - -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -unsigned long clockevent_delta2ns(unsigned long latch, - struct clock_event_device *evt) -{ - u64 clc = ((u64) latch << evt->shift); - - if (unlikely(!evt->mult)) { - evt->mult = 1; - WARN_ON(1); - } - - do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > LONG_MAX) - clc = LONG_MAX; - - return (unsigned long) clc; -} - -/** - * clockevents_set_mode - set the operating mode of a clock event device - * @dev: device to modify - * @mode: new mode - * - * Must be called with interrupts disabled ! 
- */ -void clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode) -{ - if (dev->mode != mode) { - dev->set_mode(mode, dev); - dev->mode = mode; - } -} - -/** - * clockevents_shutdown - shutdown the device and clear next_event - * @dev: device to shutdown - */ -void clockevents_shutdown(struct clock_event_device *dev) -{ - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - dev->next_event.tv64 = KTIME_MAX; -} - -/** - * clockevents_program_event - Reprogram the clock event device. - * @expires: absolute expiry time (monotonic clock) - * - * Returns 0 on success, -ETIME when the event is in the past. - */ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - ktime_t now) -{ - unsigned long long clc; - int64_t delta; - - if (unlikely(expires.tv64 < 0)) { - WARN_ON_ONCE(1); - return -ETIME; - } - - delta = ktime_to_ns(ktime_sub(expires, now)); - - if (delta <= 0) - return -ETIME; - - dev->next_event = expires; - - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) - return 0; - - if (delta > dev->max_delta_ns) - delta = dev->max_delta_ns; - if (delta < dev->min_delta_ns) - delta = dev->min_delta_ns; - - clc = delta * dev->mult; - clc >>= dev->shift; - - return dev->set_next_event((unsigned long) clc, dev); -} - -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - -/* - * Called after a notify add to make devices available which were - * released from the notifier call. 
- */ -static void clockevents_notify_released(void) -{ - struct clock_event_device *dev; - - while (!list_empty(&clockevents_released)) { - dev = list_entry(clockevents_released.next, - struct clock_event_device, list); - list_del(&dev->list); - list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); - } -} - -/** - * clockevents_register_device - register a clock event device - * @dev: device to register - */ -void clockevents_register_device(struct clock_event_device *dev) -{ - unsigned long flags; - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - /* - * A nsec2cyc multiplicator of 0 is invalid and we'd crash - * on it, so fix it up and emit a warning: - */ - if (unlikely(!dev->mult)) { - dev->mult = 1; - WARN_ON(1); - } - - spin_lock_irqsave(&clockevents_lock, flags); - - list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); - clockevents_notify_released(); - - spin_unlock_irqrestore(&clockevents_lock, flags); -} - -/* - * Noop handler when we shut down an event device - */ -void clockevents_handle_noop(struct clock_event_device *dev) -{ -} - -/** - * clockevents_exchange_device - release and request clock devices - * @old: device to release (can be NULL) - * @new: device to request (can be NULL) - * - * Called from the notifier chain. clockevents_lock is held already - */ -void clockevents_exchange_device(struct clock_event_device *old, - struct clock_event_device *new) -{ - unsigned long flags; - - local_irq_save(flags); - /* - * Caller releases a clock event device. We queue it into the - * released list and do a notify add later. 
- */ - if (old) { - clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); - list_del(&old->list); - list_add(&old->list, &clockevents_released); - } - - if (new) { - BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_shutdown(new); - } - local_irq_restore(flags); -} - -#ifdef CONFIG_GENERIC_CLOCKEVENTS -/** - * clockevents_notify - notification about relevant events - */ -void clockevents_notify(unsigned long reason, void *arg) -{ - struct list_head *node, *tmp; - unsigned long flags; - - spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); - - switch (reason) { - case CLOCK_EVT_NOTIFY_CPU_DEAD: - /* - * Unregister the clock event devices which were - * released from the users in the notify chain. - */ - list_for_each_safe(node, tmp, &clockevents_released) - list_del(node); - break; - default: - break; - } - spin_unlock_irqrestore(&clockevents_lock, flags); -} -EXPORT_SYMBOL_GPL(clockevents_notify); -#endif -/* - * linux/kernel/time/clocksource.c - * - * This file contains the functions which manage clocksource drivers. - * - * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * TODO WishList: - * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern - */ - -#include -#include -#include -#include -#include /* for spin_unlock_irq() using preempt_count() m68k */ -#include - -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - -/*[Clocksource internal variables]--------- - * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. - * next_clocksource: - * pending next selected clocksource. - * clocksource_list: - * linked list with the registered clocksources - * clocksource_lock: - * protects manipulations to curr_clocksource and next_clocksource - * and the clocksource_list - * override_name: - * Name of the user-specified clocksource. - */ -static struct clocksource *curr_clocksource = &clocksource_jiffies; -static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; -static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); -static char override_name[32]; -static int finished_booting; - -/* clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. 
- */ -static int __init clocksource_done_booting(void) -{ - finished_booting = 1; - return 0; -} -fs_initcall(clocksource_done_booting); - -#ifdef CONFIG_CLOCKSOURCE_WATCHDOG -static LIST_HEAD(watchdog_list); -static struct clocksource *watchdog; -static struct timer_list watchdog_timer; -static DEFINE_SPINLOCK(watchdog_lock); -static cycle_t watchdog_last; -static unsigned long watchdog_resumed; - -/* - * Interval: 0.5sec Threshold: 0.0625s - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) - -static void clocksource_ratewd(struct clocksource *cs, int64_t delta) -{ - if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) - return; - - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); -} - -static void clocksource_watchdog(unsigned long data) -{ - struct clocksource *cs, *tmp; - cycle_t csnow, wdnow; - int64_t wd_nsec, cs_nsec; - int resumed; - - spin_lock(&watchdog_lock); - - resumed = test_and_clear_bit(0, &watchdog_resumed); - - wdnow = watchdog->read(); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); - watchdog_last = wdnow; - - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { - csnow = cs->read(); - - if (unlikely(resumed)) { - cs->wd_last = csnow; - continue; - } - - /* Initialized ? 
*/ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { - if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as - * highres-capable, notify the rest of the - * system as well so that we transition - * into high-res mode: - */ - tick_clock_notify(); - } - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = csnow; - } else { - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); - cs->wd_last = csnow; - /* Check the delta. Might remove from the list ! */ - clocksource_ratewd(cs, cs_nsec - wd_nsec); - } - } - - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); - - if (next_cpu >= nr_cpu_ids) - next_cpu = first_cpu(cpu_online_map); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); - } - spin_unlock(&watchdog_lock); -} -static void clocksource_resume_watchdog(void) -{ - set_bit(0, &watchdog_resumed); -} - -static void clocksource_check_watchdog(struct clocksource *cs) -{ - struct clocksource *cse; - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - int started = !list_empty(&watchdog_list); - - list_add(&cs->wd_list, &watchdog_list); - if (!started && watchdog) { - watchdog_last = watchdog->read(); - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); - } - } else { - if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - - if (!watchdog || cs->rating > watchdog->rating) { - if (watchdog) - del_timer(&watchdog_timer); - watchdog = cs; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - - /* Reset watchdog cycles */ - list_for_each_entry(cse, &watchdog_list, wd_list) - 
cse->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Start if list is not empty */ - if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(); - watchdog_timer.expires = - jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); - } - } - } - spin_unlock_irqrestore(&watchdog_lock, flags); -} -#else -static void clocksource_check_watchdog(struct clocksource *cs) -{ - if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; -} - -static inline void clocksource_resume_watchdog(void) { } -#endif - -/** - * clocksource_resume - resume the clocksource(s) - */ -void clocksource_resume(void) -{ - struct clocksource *cs; - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - - list_for_each_entry(cs, &clocksource_list, list) { - if (cs->resume) - cs->resume(); - } - - clocksource_resume_watchdog(); - - spin_unlock_irqrestore(&clocksource_lock, flags); -} - -/** - * clocksource_touch_watchdog - Update watchdog - * - * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * - */ -void clocksource_touch_watchdog(void) -{ - clocksource_resume_watchdog(); -} - -/** - * clocksource_get_next - Returns the selected clocksource - * - */ -struct clocksource *clocksource_get_next(void) -{ - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - if (next_clocksource && finished_booting) { - curr_clocksource = next_clocksource; - next_clocksource = NULL; - } - spin_unlock_irqrestore(&clocksource_lock, flags); - - return curr_clocksource; -} - -/** - * select_clocksource - Selects the best registered clocksource. - * - * Private function. Must hold clocksource_lock when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. 
- */ -static struct clocksource *select_clocksource(void) -{ - struct clocksource *next; - - if (list_empty(&clocksource_list)) - return NULL; - - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); - - if (next == curr_clocksource) - return NULL; - - return next; -} - -/* - * Enqueue the clocksource sorted by rating - */ -static int clocksource_enqueue(struct clocksource *c) -{ - struct list_head *tmp, *entry = &clocksource_list; - - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; - - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; - /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; -} - -/** - * clocksource_register - Used to install new clocksources - * @t: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. 
- */ -int clocksource_register(struct clocksource *c) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; -} -EXPORT_SYMBOL(clocksource_register); - -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); -} - -/** - * clocksource_unregister - remove a registered clocksource - */ -void clocksource_unregister(struct clocksource *cs) -{ - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); - list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); - spin_unlock_irqrestore(&clocksource_lock, flags); -} - -#ifdef CONFIG_SYSFS -/** - * sysfs_show_current_clocksources - sysfs interface for current clocksource - * @dev: unused - * @buf: char buffer to be filled with clocksource list - * - * Provides sysfs interface for listing current clocksource. 
- */ -static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - ssize_t count = 0; - - spin_lock_irq(&clocksource_lock); - count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); - - return count; -} - -/** - * sysfs_override_clocksource - interface for manually overriding clocksource - * @dev: unused - * @buf: name of override clocksource - * @count: length of buffer - * - * Takes input from sysfs interface for manually overriding the default - * clocksource selction. - */ -static ssize_t sysfs_override_clocksource(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, size_t count) -{ - struct clocksource *ovr = NULL; - size_t ret = count; - int len; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - - spin_lock_irq(&clocksource_lock); - - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } - - spin_unlock_irq(&clocksource_lock); - - return ret; -} - -/** - * sysfs_show_available_clocksources - sysfs interface for listing clocksource - * @dev: unused - * @buf: char buffer to be filled with clocksource list - * - * Provides sysfs interface for listing registered clocksources - */ -static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, - char *buf) -{ - struct clocksource *src; - ssize_t count = 0; - 
- spin_lock_irq(&clocksource_lock); - list_for_each_entry(src, &clocksource_list, list) { - count += snprintf(buf + count, - max((ssize_t)PAGE_SIZE - count, (ssize_t)0), - "%s ", src->name); - } - spin_unlock_irq(&clocksource_lock); - - count += snprintf(buf + count, - max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); - - return count; -} - -/* - * Sysfs setup bits: - */ -static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, - sysfs_override_clocksource); - -static SYSDEV_ATTR(available_clocksource, 0444, - sysfs_show_available_clocksources, NULL); - -static struct sysdev_class clocksource_sysclass = { - .name = "clocksource", -}; - -static struct sys_device device_clocksource = { - .id = 0, - .cls = &clocksource_sysclass, -}; - -static int __init init_clocksource_sysfs(void) -{ - int error = sysdev_class_register(&clocksource_sysclass); - - if (!error) - error = sysdev_register(&device_clocksource); - if (!error) - error = sysdev_create_file( - &device_clocksource, - &attr_current_clocksource); - if (!error) - error = sysdev_create_file( - &device_clocksource, - &attr_available_clocksource); - return error; -} - -device_initcall(init_clocksource_sysfs); -#endif /* CONFIG_SYSFS */ - -/** - * boot_override_clocksource - boot clock override - * @str: override name - * - * Takes a clocksource= boot argument and uses it - * as the clocksource override name. - */ -static int __init boot_override_clocksource(char* str) -{ - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); - if (str) - strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); - return 1; -} - -__setup("clocksource=", boot_override_clocksource); - -/** - * boot_override_clock - Compatibility layer for deprecated boot option - * @str: override name - * - * DEPRECATED! 
Takes a clock= boot argument and uses it - * as the clocksource override name - */ -static int __init boot_override_clock(char* str) -{ - if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. " - "Use clocksource=acpi_pm.\n"); - return boot_override_clocksource("acpi_pm"); - } - printk("Warning! clock= boot option is deprecated. " - "Use clocksource=xyz\n"); - return boot_override_clocksource(str); -} - -__setup("clock=", boot_override_clock); -/*********************************************************************** -* linux/kernel/time/jiffies.c -* -* This file contains the jiffies based clocksource. -* -* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) -* -* This program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2 of the License, or -* (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with this program; if not, write to the Free Software -* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -* -************************************************************************/ -#include -#include -#include - -/* The Jiffies based clocksource is the lowest common - * denominator clock source which should function on - * all systems. It has the same coarse resolution as - * the timer interrupt frequency HZ and it suffers - * inaccuracies caused by missed or lost timer - * interrupts and the inability for the timer - * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended - * for "tick-less" systems. 
- */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) - -/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater then 8 overflow 32bits when - * HZ=100. - */ -#define JIFFIES_SHIFT 8 - -static cycle_t jiffies_read(void) -{ - return (cycle_t) jiffies; -} - -struct clocksource clocksource_jiffies = { - .name = "jiffies", - .rating = 1, /* lowest valid rating*/ - .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ - .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, -}; - -static int __init init_jiffies_clocksource(void) -{ - return clocksource_register(&clocksource_jiffies); -} - -core_initcall(init_jiffies_clocksource); -/* - * linux/kernel/time/ntp.c - * - * NTP state machine interfaces and logic. - * - * This code was mainly moved from kernel/timer.c and kernel/time.c - * Please see those files for relevant copyright info and historical - * changelogs. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Timekeeping variables - */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec; /* ACTHZ period (nsec) */ -u64 tick_length; -static u64 tick_length_base; - -static struct hrtimer leap_timer; - -#define MAX_TICKADJ 500 /* microsecs */ -#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -static int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -static long time_tai; /* TAI offset (s) */ -static s64 time_offset; /* time adjustment (ns) */ -static long time_constant = 2; /* pll time constant */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static s64 time_freq; /* frequency offset (scaled ns/s)*/ -static long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -static long ntp_tick_adj; - -static void ntp_update_frequency(void) -{ - u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; - second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; - second_length += time_freq; - - tick_length_base = second_length; - - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; - tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); -} - -static void ntp_update_offset(long offset) -{ - long mtemp; - s64 freq_adj; - - if (!(time_status & STA_PLL)) - return; - - if (!(time_status & STA_NANO)) - offset *= NSEC_PER_USEC; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - offset = min(offset, MAXPHASE); - offset = max(offset, -MAXPHASE); - - /* - * Select how the frequency is to be controlled - * and in which mode (PLL or FLL). 
- */ - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - - freq_adj = (s64)offset * mtemp; - freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); - time_status &= ~STA_MODE; - if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { - freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), - mtemp); - time_status |= STA_MODE; - } - freq_adj += time_freq; - freq_adj = min(freq_adj, MAXFREQ_SCALED); - time_freq = max(freq_adj, -MAXFREQ_SCALED); - - time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); -} - -/** - * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock - */ -void ntp_clear(void) -{ - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - - ntp_update_frequency(); - - tick_length = tick_length_base; - time_offset = 0; -} - -/* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. 
- */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) -{ - enum hrtimer_restart res = HRTIMER_NORESTART; - - write_seqlock_irq(&xtime_lock); - - switch (time_state) { - case TIME_OK: - break; - case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - time_state = TIME_OOP; - printk(KERN_NOTICE "Clock: " - "inserting leap second 23:59:60 UTC\n"); - leap_timer.expires = ktime_add_ns(leap_timer.expires, - NSEC_PER_SEC); - res = HRTIMER_RESTART; - break; - case TIME_DEL: - xtime.tv_sec++; - time_tai--; - wall_to_monotonic.tv_sec--; - time_state = TIME_WAIT; - printk(KERN_NOTICE "Clock: " - "deleting leap second 23:59:59 UTC\n"); - break; - case TIME_OOP: - time_tai++; - time_state = TIME_WAIT; - /* fall through */ - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - } - update_vsyscall(&xtime, clock); - - write_sequnlock_irq(&xtime_lock); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ - s64 time_adj; - - /* Bump the maxerror field */ - time_maxerror += MAXFREQ / NSEC_PER_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Compute the phase adjustment for the next second. The offset is - * reduced by a fixed factor times the time constant. 
- */ - tick_length = tick_length_base; - time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); - time_offset -= time_adj; - tick_length += time_adj; - - if (unlikely(time_adjust)) { - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; - } else if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; - } else { - tick_length += (s64)(time_adjust * NSEC_PER_USEC / - NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; - time_adjust = 0; - } - } -} - -#ifdef CONFIG_GENERIC_CMOS_UPDATE - -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock __read_mostly; - -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timespec now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
- */ - return; - - getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) - fail = update_persistent_clock(now); - - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); -} - -static void notify_cmos_timer(void) -{ - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); -} - -#else -static inline void notify_cmos_timer(void) { } -#endif - -/* adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - struct timespec ts; - long save_adjust, sec; - int result; - - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { - /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) - return -EINVAL; - } - - /* if the quartz is off by more than 10% something is VERY wrong ! 
*/ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); - getnstimeofday(&ts); - - write_seqlock_irq(&xtime_lock); - - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - - /* If there are input parameters, then process them */ - if (txc->modes) { - if (txc->modes & ADJ_STATUS) { - if ((time_status & STA_PLL) && - !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status = STA_UNSYNC; - } - /* only set allowed bits */ - time_status &= STA_RONLY; - time_status |= txc->status & ~STA_RONLY; - - switch (time_state) { - case TIME_OK: - start_timer: - sec = ts.tv_sec; - if (time_status & STA_INS) { - time_state = TIME_INS; - sec += 86400 - sec % 86400; - hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; - sec += 86400 - (sec + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); - } - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - goto start_timer; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } - } - - if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; - if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; - - if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); - } - - if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; - if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; - - if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; - if (!(time_status & STA_NANO)) - time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); 
- } - - if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; - - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } - if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; - - if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); - } - - result = time_state; /* mostly `TIME_OK' */ - if (time_status & (STA_UNSYNC|STA_CLOCKERR)) - result = TIME_ERROR; - - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = 1; - txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; - txc->tick = tick_usec; - txc->tai = time_tai; - - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; - write_sequnlock_irq(&xtime_lock); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; - if (!(time_status & STA_NANO)) - txc->time.tv_usec /= NSEC_PER_USEC; - - notify_cmos_timer(); - - return result; -} - -static int __init ntp_tick_adj_setup(char *str) -{ - ntp_tick_adj = simple_strtol(str, NULL, 0); - return 1; -} - -__setup("ntp_tick_adj=", ntp_tick_adj_setup); - -void __init ntp_init(void) -{ - ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = ntp_leap_second; -} -/* - * linux/kernel/time/tick-broadcast.c - * - * This file contains 
functions which emulate a local clock-event - * device via a broadcast event source. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tick-internal.h" - -/* - * Broadcast support for broken x86 hardware, where the local apic - * timer stops in C3 state. - */ - -struct tick_device tick_broadcast_device; -static cpumask_t tick_broadcast_mask; -static DEFINE_SPINLOCK(tick_broadcast_lock); -static int tick_broadcast_force; - -#ifdef CONFIG_TICK_ONESHOT -static void tick_broadcast_clear_oneshot(int cpu); -#else -static inline void tick_broadcast_clear_oneshot(int cpu) { } -#endif - -/* - * Debugging: see timer_list.c - */ -struct tick_device *tick_get_broadcast_device(void) -{ - return &tick_broadcast_device; -} - -cpumask_t *tick_get_broadcast_mask(void) -{ - return &tick_broadcast_mask; -} - -/* - * Start the device in periodic mode - */ -static void tick_broadcast_start_periodic(struct clock_event_device *bc) -{ - if (bc) - tick_setup_periodic(bc, 1); -} - -/* - * Check, if the device can be utilized as broadcast device: - */ -int tick_check_broadcast_device(struct clock_event_device *dev) -{ - if ((tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; - - clockevents_exchange_device(NULL, dev); - tick_broadcast_device.evtdev = dev; - if (!cpus_empty(tick_broadcast_mask)) - tick_broadcast_start_periodic(dev); - return 1; -} - -/* - * Check, if the device is the broadcast device - */ -int tick_is_broadcast_device(struct clock_event_device *dev) -{ - return (dev && tick_broadcast_device.evtdev == dev); -} - -/* - * Check, if the device is disfunctional and a place holder, which 
- * needs to be handled by the broadcast device. - */ -int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - /* - * Devices might be registered with both periodic and oneshot - * mode disabled. This signals, that the device needs to be - * operated from the broadcast device and is a placeholder for - * the cpu local device. - */ - if (!tick_device_is_functional(dev)) { - dev->event_handler = tick_handle_periodic; - cpu_set(cpu, tick_broadcast_mask); - tick_broadcast_start_periodic(tick_broadcast_device.evtdev); - ret = 1; - } else { - /* - * When the new device is not affected by the stop - * feature and the cpu is marked in the broadcast mask - * then clear the broadcast bit. - */ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { - int cpu = smp_processor_id(); - - cpu_clear(cpu, tick_broadcast_mask); - tick_broadcast_clear_oneshot(cpu); - } - } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); - return ret; -} - -/* - * Broadcast the event to the cpus, which are set in the mask - */ -static void tick_do_broadcast(cpumask_t mask) -{ - int cpu = smp_processor_id(); - struct tick_device *td; - - /* - * Check, if the current cpu is in the mask - */ - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->event_handler(td->evtdev); - } - - if (!cpus_empty(mask)) { - /* - * It might be necessary to actually check whether the devices - * have different broadcast functions. For now, just use the - * one of the first device. 
This works as long as we have this - * misfeature only on x86 (lapic) - */ - cpu = first_cpu(mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->broadcast(mask); - } -} - -/* - * Periodic broadcast: - * - invoke the broadcast handlers - */ -static void tick_do_periodic_broadcast(void) -{ - cpumask_t mask; - - spin_lock(&tick_broadcast_lock); - - cpus_and(mask, cpu_online_map, tick_broadcast_mask); - tick_do_broadcast(mask); - - spin_unlock(&tick_broadcast_lock); -} - -/* - * Event handler for periodic broadcast ticks - */ -static void tick_handle_periodic_broadcast(struct clock_event_device *dev) -{ - ktime_t next; - - tick_do_periodic_broadcast(); - - /* - * The device is in periodic mode. No reprogramming necessary: - */ - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; - - /* - * Setup the next period for devices, which do not have - * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() - * sets dev->next_event only when the event is really - * programmed to the device. - */ - for (next = dev->next_event; ;) { - next = ktime_add(next, tick_period); - - if (!clockevents_program_event(dev, next, ktime_get())) - return; - tick_do_periodic_broadcast(); - } -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop - */ -static void tick_do_broadcast_on_off(unsigned long *reason) -{ - struct clock_event_device *bc, *dev; - struct tick_device *td; - unsigned long flags; - int cpu, bc_stopped; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - bc = tick_broadcast_device.evtdev; - - /* - * Is the device not affected by the powerstate ? 
- */ - if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; - - if (!tick_device_is_functional(dev)) - goto out; - - bc_stopped = cpus_empty(tick_broadcast_mask); - - switch (*reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpu_isset(cpu, tick_broadcast_mask)) { - cpu_set(cpu, tick_broadcast_mask); - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) - clockevents_shutdown(dev); - } - if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - tick_broadcast_force = 1; - break; - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (!tick_broadcast_force && - cpu_isset(cpu, tick_broadcast_mask)) { - cpu_clear(cpu, tick_broadcast_mask); - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) - tick_setup_periodic(dev, 0); - } - break; - } - - if (cpus_empty(tick_broadcast_mask)) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); - } -out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop. 
- */ -void tick_broadcast_on_off(unsigned long reason, int *oncpu) -{ - if (!cpu_isset(*oncpu, cpu_online_map)) - printk(KERN_ERR "tick-broadcast: ignoring broadcast for " - "offline CPU #%d\n", *oncpu); - else - tick_do_broadcast_on_off(&reason); -} - -/* - * Set the periodic handler depending on broadcast on/off - */ -void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) -{ - if (!broadcast) - dev->event_handler = tick_handle_periodic; - else - dev->event_handler = tick_handle_periodic_broadcast; -} - -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int *cpup) -{ - struct clock_event_device *bc; - unsigned long flags; - unsigned int cpu = *cpup; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpu_clear(cpu, tick_broadcast_mask); - - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_shutdown(bc); - } - - spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -void tick_suspend_broadcast(void) -{ - struct clock_event_device *bc; - unsigned long flags; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - if (bc) - clockevents_shutdown(bc); - - spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -int tick_resume_broadcast(void) -{ - struct clock_event_device *bc; - unsigned long flags; - int broadcast = 0; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - - if (bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); - - switch (tick_broadcast_device.mode) { - case TICKDEV_MODE_PERIODIC: - if(!cpus_empty(tick_broadcast_mask)) - tick_broadcast_start_periodic(bc); - broadcast = cpu_isset(smp_processor_id(), - tick_broadcast_mask); - break; - case TICKDEV_MODE_ONESHOT: - broadcast = tick_resume_broadcast_oneshot(bc); - break; - } - } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); - - return broadcast; -} 
- - -#ifdef CONFIG_TICK_ONESHOT - -static cpumask_t tick_broadcast_oneshot_mask; - -/* - * Debugging: see timer_list.c - */ -cpumask_t *tick_get_broadcast_oneshot_mask(void) -{ - return &tick_broadcast_oneshot_mask; -} - -static int tick_broadcast_set_event(ktime_t expires, int force) -{ - struct clock_event_device *bc = tick_broadcast_device.evtdev; - - return tick_dev_program_event(bc, expires, force); -} - -int tick_resume_broadcast_oneshot(struct clock_event_device *bc) -{ - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return 0; -} - -/* - * Handle oneshot mode broadcasting - */ -static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) -{ - struct tick_device *td; - cpumask_t mask; - ktime_t now, next_event; - int cpu; - - spin_lock(&tick_broadcast_lock); -again: - dev->next_event.tv64 = KTIME_MAX; - next_event.tv64 = KTIME_MAX; - mask = CPU_MASK_NONE; - now = ktime_get(); - /* Find all expired events */ - for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { - td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) - cpu_set(cpu, mask); - else if (td->evtdev->next_event.tv64 < next_event.tv64) - next_event.tv64 = td->evtdev->next_event.tv64; - } - - /* - * Wakeup the cpus which have an expired event. - */ - tick_do_broadcast(mask); - - /* - * Two reasons for reprogram: - * - * - The global event did not expire any CPU local - * events. This happens in dyntick mode, as the maximum PIT - * delta is quite small. - * - * - There are pending events on sleeping CPUs which were not - * in the event mask - */ - if (next_event.tv64 != KTIME_MAX) { - /* - * Rearm the broadcast device. 
If event expired, - * repeat the above - */ - if (tick_broadcast_set_event(next_event, 0)) - goto again; - } - spin_unlock(&tick_broadcast_lock); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop - */ -void tick_broadcast_oneshot_control(unsigned long reason) -{ - struct clock_event_device *bc, *dev; - struct tick_device *td; - unsigned long flags; - int cpu; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - /* - * Periodic mode does not care about the enter/exit of power - * states - */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - goto out; - - bc = tick_broadcast_device.evtdev; - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; - - if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_set(cpu, tick_broadcast_oneshot_mask); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); - } - } else { - if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { - cpu_clear(cpu, tick_broadcast_oneshot_mask); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); - } - } - -out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Reset the one shot broadcast for a cpu - * - * Called with tick_broadcast_lock held - */ -static void tick_broadcast_clear_oneshot(int cpu) -{ - cpu_clear(cpu, tick_broadcast_oneshot_mask); -} - -static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) -{ - struct tick_device *td; - int cpu; - - for_each_cpu_mask_nr(cpu, *mask) { - td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev) - td->evtdev->next_event = expires; - } -} - -/** - * tick_broadcast_setup_oneshot - setup the broadcast device - */ -void 
tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - /* Set it up only once ! */ - if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; - int cpu = smp_processor_id(); - cpumask_t mask; - - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - /* Take the do_timer update */ - tick_do_timer_cpu = cpu; - - /* - * We must be careful here. There might be other CPUs - * waiting for periodic broadcast. We need to set the - * oneshot_mask bits for those and program the - * broadcast device to fire. - */ - mask = tick_broadcast_mask; - cpu_clear(cpu, mask); - cpus_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, mask); - - if (was_periodic && !cpus_empty(mask)) { - tick_broadcast_init_next_event(&mask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); - } else - bc->next_event.tv64 = KTIME_MAX; - } -} - -/* - * Select oneshot operating mode for the broadcast device - */ -void tick_broadcast_switch_to_oneshot(void) -{ - struct clock_event_device *bc; - unsigned long flags; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; - bc = tick_broadcast_device.evtdev; - if (bc) - tick_broadcast_setup_oneshot(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - - -/* - * Remove a dead CPU from broadcasting - */ -void tick_shutdown_broadcast_oneshot(unsigned int *cpup) -{ - unsigned long flags; - unsigned int cpu = *cpup; - - spin_lock_irqsave(&tick_broadcast_lock, flags); - - /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! 
- */ - cpu_clear(cpu, tick_broadcast_oneshot_mask); - - spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Check, whether the broadcast device is in one shot mode - */ -int tick_broadcast_oneshot_active(void) -{ - return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; -} - -#endif -/* - * linux/kernel/time/tick-common.c - * - * This file contains the base functions to manage periodic tick - * related events. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "tick-internal.h" - -/* - * Tick devices - */ -DEFINE_PER_CPU(struct tick_device, tick_cpu_device); -/* - * Tick next event: keeps track of the tick time - */ -ktime_t tick_next_period; -ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -DEFINE_SPINLOCK(tick_device_lock); - -/* - * Debugging: see timer_list.c - */ -struct tick_device *tick_get_device(int cpu) -{ - return &per_cpu(tick_cpu_device, cpu); -} - -/** - * tick_is_oneshot_available - check for a oneshot capable event device - */ -int tick_is_oneshot_available(void) -{ - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - - return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); -} - -/* - * Periodic tick - */ -static void tick_periodic(int cpu) -{ - if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); - - /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, tick_period); - - do_timer(1); - write_sequnlock(&xtime_lock); - } - - update_process_times(user_mode(get_irq_regs())); - profile_tick(CPU_PROFILING); -} - -/* - * Event handler for periodic ticks - */ -void tick_handle_periodic(struct clock_event_device *dev) -{ - int cpu = 
smp_processor_id(); - ktime_t next; - - tick_periodic(cpu); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); - for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) - return; - /* - * Have to be careful here. If we're in oneshot mode, - * before we call tick_periodic() in a loop, we need - * to be sure we're using a real hardware clocksource. - * Otherwise we could get trapped in an infinite - * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing - * the loop to trigger again and again. - */ - if (timekeeping_valid_for_hres()) - tick_periodic(cpu); - next = ktime_add(next, tick_period); - } -} - -/* - * Setup the device for a periodic tick - */ -void tick_setup_periodic(struct clock_event_device *dev, int broadcast) -{ - tick_set_periodic_handler(dev, broadcast); - - /* Broadcast setup ? */ - if (!tick_device_is_functional(dev)) - return; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !tick_broadcast_oneshot_active()) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); - } else { - unsigned long seq; - ktime_t next; - - do { - seq = read_seqbegin(&xtime_lock); - next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); - - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - - for (;;) { - if (!clockevents_program_event(dev, next, ktime_get())) - return; - next = ktime_add(next, tick_period); - } - } -} - -/* - * Setup the tick device - */ -static void tick_setup_device(struct tick_device *td, - struct clock_event_device *newdev, int cpu, - const cpumask_t *cpumask) -{ - ktime_t next_event; - void (*handler)(struct clock_event_device *) = NULL; - - /* - * First device setup ? 
- */ - if (!td->evtdev) { - /* - * If no cpu took the do_timer update, assign it to - * this cpu: - */ - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; - tick_next_period = ktime_get(); - tick_period = ktime_set(0, NSEC_PER_SEC / HZ); - } - - /* - * Startup in periodic mode first. - */ - td->mode = TICKDEV_MODE_PERIODIC; - } else { - handler = td->evtdev->event_handler; - next_event = td->evtdev->next_event; - td->evtdev->event_handler = clockevents_handle_noop; - } - - td->evtdev = newdev; - - /* - * When the device is not per cpu, pin the interrupt to the - * current cpu: - */ - if (!cpus_equal(newdev->cpumask, *cpumask)) - irq_set_affinity(newdev->irq, *cpumask); - - /* - * When global broadcasting is active, check if the current - * device is registered as a placeholder for broadcast mode. - * This allows us to handle this x86 misfeature in a generic - * way. - */ - if (tick_device_uses_broadcast(newdev, cpu)) - return; - - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(newdev, 0); - else - tick_setup_oneshot(newdev, handler, next_event); -} - -/* - * Check, if the new registered device should be used. - */ -static int tick_check_new_device(struct clock_event_device *newdev) -{ - struct clock_event_device *curdev; - struct tick_device *td; - int cpu, ret = NOTIFY_OK; - unsigned long flags; - - spin_lock_irqsave(&tick_device_lock, flags); - - cpu = smp_processor_id(); - if (!cpu_isset(cpu, newdev->cpumask)) - goto out_bc; - - td = &per_cpu(tick_cpu_device, cpu); - curdev = td->evtdev; - - /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. 
- */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) - goto out_bc; - } - - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! - */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } - - /* - * Replace the eventually existing device by the new - * device. If the current device is the broadcast device, do - * not give it back to the clockevents layer ! - */ - if (tick_is_broadcast_device(curdev)) { - clockevents_shutdown(curdev); - curdev = NULL; - } - clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); - if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) - tick_oneshot_notify(); - - spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; - -out_bc: - /* - * Can the new device be used as a broadcast device ? - */ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - - spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; -} - -/* - * Shutdown an event device on a given cpu: - * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. - */ -static void tick_shutdown(unsigned int *cpup) -{ - struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); - struct clock_event_device *dev = td->evtdev; - unsigned long flags; - - spin_lock_irqsave(&tick_device_lock, flags); - td->mode = TICKDEV_MODE_PERIODIC; - if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! 
- */ - dev->mode = CLOCK_EVT_MODE_UNUSED; - clockevents_exchange_device(dev, NULL); - td->evtdev = NULL; - } - /* Transfer the do_timer job away from this cpu */ - if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); - - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : - TICK_DO_TIMER_NONE; - } - spin_unlock_irqrestore(&tick_device_lock, flags); -} - -static void tick_suspend(void) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - - spin_lock_irqsave(&tick_device_lock, flags); - clockevents_shutdown(td->evtdev); - spin_unlock_irqrestore(&tick_device_lock, flags); -} - -static void tick_resume(void) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - int broadcast = tick_resume_broadcast(); - - spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); - - if (!broadcast) { - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); - } - spin_unlock_irqrestore(&tick_device_lock, flags); -} - -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block 
tick_notifier = { - .notifier_call = tick_notify, -}; - -/** - * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework - */ -void __init tick_init(void) -{ - clockevents_register_notifier(&tick_notifier); -} -/* - * linux/kernel/time/tick-oneshot.c - * - * This file contains functions which manage high resolution tick - * related events. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tick-internal.h" - -/** - * tick_program_event internal worker function - */ -int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, - int force) -{ - ktime_t now = ktime_get(); - int i; - - for (i = 0;;) { - int ret = clockevents_program_event(dev, expires, now); - - if (!ret || !force) - return ret; - - /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it - * and emit a warning. - */ - if (++i > 2) { - /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %lu nsec\n", - dev->name ? 
dev->name : "?", - dev->min_delta_ns << 1); - - i = 0; - } - - now = ktime_get(); - expires = ktime_add_ns(now, dev->min_delta_ns); - } -} - -/** - * tick_program_event - */ -int tick_program_event(ktime_t expires, int force) -{ - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - - return tick_dev_program_event(dev, expires, force); -} - -/** - * tick_resume_onshot - resume oneshot mode - */ -void tick_resume_oneshot(void) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - struct clock_event_device *dev = td->evtdev; - - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - tick_program_event(ktime_get(), 1); -} - -/** - * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) - */ -void tick_setup_oneshot(struct clock_event_device *newdev, - void (*handler)(struct clock_event_device *), - ktime_t next_event) -{ - newdev->event_handler = handler; - clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - tick_dev_program_event(newdev, next_event, 1); -} - -/** - * tick_switch_to_oneshot - switch to oneshot mode - */ -int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - struct clock_event_device *dev = td->evtdev; - - if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) { - - printk(KERN_INFO "Clockevents: " - "could not switch to one-shot mode:"); - if (!dev) { - printk(" no tick device\n"); - } else { - if (!tick_device_is_functional(dev)) - printk(" %s is not functional.\n", dev->name); - else - printk(" %s does not support one-shot mode.\n", - dev->name); - } - return -EINVAL; - } - - td->mode = TICKDEV_MODE_ONESHOT; - dev->event_handler = handler; - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_switch_to_oneshot(); - return 0; -} - -#ifdef CONFIG_HIGH_RES_TIMERS -/** - * tick_init_highres - switch to high resolution mode - * - * Called with interrupts disabled. 
- */ -int tick_init_highres(void) -{ - return tick_switch_to_oneshot(hrtimer_interrupt); -} -#endif -/* - * linux/kernel/time/tick-sched.c - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner - * - * No idle tick implementation for low and high resolution timers - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * Distribute under GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "tick-internal.h" - -/* - * Per cpu nohz control structure - */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); - -/* - * The time, when the last jiffy update happened. Protected by xtime_lock. - */ -static ktime_t last_jiffies_update; - -struct tick_sched *tick_get_tick_sched(int cpu) -{ - return &per_cpu(tick_cpu_sched, cpu); -} - -/* - * Must be called with interrupts disabled ! - */ -static void tick_do_update_jiffies64(ktime_t now) -{ - unsigned long ticks = 0; - ktime_t delta; - - /* - * Do a quick check without holding xtime_lock: - */ - delta = ktime_sub(now, last_jiffies_update); - if (delta.tv64 < tick_period.tv64) - return; - - /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); - - delta = ktime_sub(now, last_jiffies_update); - if (delta.tv64 >= tick_period.tv64) { - - delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); - - /* Slow path for long timeouts */ - if (unlikely(delta.tv64 >= tick_period.tv64)) { - s64 incr = ktime_to_ns(tick_period); - - ticks = ktime_divns(delta, incr); - - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); - } - do_timer(++ticks); - - /* Keep the tick_next_period variable up to date */ - tick_next_period = ktime_add(last_jiffies_update, tick_period); - } - write_sequnlock(&xtime_lock); -} - -/* - * Initialize and return retrieve the jiffies update. 
- */ -static ktime_t tick_init_jiffy_update(void) -{ - ktime_t period; - - write_seqlock(&xtime_lock); - /* Did we start the jiffies update yet ? */ - if (last_jiffies_update.tv64 == 0) - last_jiffies_update = tick_next_period; - period = last_jiffies_update; - write_sequnlock(&xtime_lock); - return period; -} - -/* - * NOHZ - aka dynamic tick functionality - */ -#ifdef CONFIG_NO_HZ -/* - * NO HZ enabled ? - */ -static int tick_nohz_enabled __read_mostly = 1; - -/* - * Enable / Disable tickless mode - */ -static int __init setup_tick_nohz(char *str) -{ - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; -} - -__setup("nohz=", setup_tick_nohz); - -/** - * tick_nohz_update_jiffies - update jiffies when idle was interrupted - * - * Called from interrupt entry when the CPU was idle - * - * In case the sched_tick was stopped on this CPU, we have to check if jiffies - * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. 
- */ -void tick_nohz_update_jiffies(void) -{ - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - unsigned long flags; - ktime_t now; - - if (!ts->tick_stopped) - return; - - cpu_clear(cpu, nohz_cpu_mask); - now = ktime_get(); - ts->idle_waketime = now; - - local_irq_save(flags); - tick_do_update_jiffies64(now); - local_irq_restore(flags); - - touch_softlockup_watchdog(); -} - -void tick_nohz_stop_idle(int cpu) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - if (ts->idle_active) { - ktime_t now, delta; - now = ktime_get(); - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_active = 0; - - sched_clock_idle_wakeup_event(0); - } -} - -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) -{ - ktime_t now, delta; - - now = ktime_get(); - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - } - ts->idle_entrytime = now; - ts->idle_active = 1; - sched_clock_idle_sleep_event(); - return now; -} - -u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - *last_update_time = ktime_to_us(ts->idle_lastupdate); - return ktime_to_us(ts->idle_sleeptime); -} - -/** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. 
- */ -void tick_nohz_stop_sched_tick(int inidle) -{ - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - struct tick_sched *ts; - ktime_t last_update, expires, now; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - int cpu; - - local_irq_save(flags); - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - now = tick_nohz_start_idle(ts); - - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - goto end; - - if (!inidle && !ts->inidle) - goto end; - - ts->inidle = 1; - - if (need_resched()) - goto end; - - if (unlikely(local_softirq_pending())) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); - ratelimit++; - } - goto end; - } - - ts->idle_calls++; - /* Read jiffies and the time when jiffies were updated last */ - do { - seq = read_seqbegin(&xtime_lock); - last_update = last_jiffies_update; - last_jiffies = jiffies; - } while (read_seqretry(&xtime_lock, seq)); - - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) - delta_jiffies = 1; - /* - * Do not stop the tick, if we are only one off - * or if the cpu is required for rcu - */ - if (!ts->tick_stopped && delta_jiffies == 1) - goto out; - - /* Schedule the tick, if we are at least one jiffie off */ - if ((long)delta_jiffies >= 1) { - - if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); - /* - * nohz_stop_sched_tick can be called several times before - 
* the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. - */ - if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpu_clear(cpu, nohz_cpu_mask); - goto out; - } - - ts->idle_tick = ts->sched_timer.expires; - ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; - rcu_enter_nohz(); - } - - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - - ts->idle_sleeps++; - - /* - * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that - * there is no timer pending or at least extremly far - * into the future (12 days for HZ=1000). In this case - * we simply stop the tick timer: - */ - if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { - ts->idle_expires.tv64 = KTIME_MAX; - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_cancel(&ts->sched_timer); - goto out; - } - - /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); - ts->idle_expires = expires; - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - goto out; - } else if (!tick_program_event(expires, 0)) - goto out; - /* - * We are past the event already. So we crossed a - * jiffie boundary. Update jiffies and raise the - * softirq. 
- */ - tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); - } - raise_softirq_irqoff(TIMER_SOFTIRQ); -out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; - ts->sleep_length = ktime_sub(dev->next_event, now); -end: - local_irq_restore(flags); -} - -/** - * tick_nohz_get_sleep_length - return the length of the current sleep - * - * Called from power state control code with interrupts disabled - */ -ktime_t tick_nohz_get_sleep_length(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - return ts->sleep_length; -} - -/** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - */ -void tick_nohz_restart_sched_tick(void) -{ - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - unsigned long ticks; - ktime_t now; - - local_irq_disable(); - tick_nohz_stop_idle(cpu); - - if (!ts->inidle || !ts->tick_stopped) { - ts->inidle = 0; - local_irq_enable(); - return; - } - - ts->inidle = 0; - - rcu_exit_nohz(); - - /* Update jiffies first */ - select_nohz_load_balancer(0); - now = ktime_get(); - tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); - - /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! - */ - ticks = jiffies - ts->idle_jiffies; - /* - * We might be one off. Do not randomly account a huge number of ticks! 
- */ - if (ticks && ticks < LONG_MAX) { - add_preempt_count(HARDIRQ_OFFSET); - account_system_time(current, HARDIRQ_OFFSET, - jiffies_to_cputime(ticks)); - sub_preempt_count(HARDIRQ_OFFSET); - } - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } - local_irq_enable(); -} - -static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) -{ - hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(ts->sched_timer.expires, 0); -} - -/* - * The nohz low res interrupt handler - */ -static void tick_nohz_handler(struct clock_event_device *dev) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); - ktime_t now = ktime_get(); - - dev->next_event.tv64 = KTIME_MAX; - - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. 
- */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); - - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start - * of idle" jiffy stamp so the idle accounting adjustment we - * do when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - - while (tick_nohz_reprogram(ts, now)) { - now = ktime_get(); - tick_do_update_jiffies64(now); - } -} - -/** - * tick_nohz_switch_to_nohz - switch to nohz mode - */ -static void tick_nohz_switch_to_nohz(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - ktime_t next; - - if (!tick_nohz_enabled) - return; - - local_irq_disable(); - if (tick_switch_to_oneshot(tick_nohz_handler)) { - local_irq_enable(); - return; - } - - ts->nohz_mode = NOHZ_MODE_LOWRES; - - /* - * Recycle the hrtimer in ts, so we can share the - * hrtimer_forward with the highres code. 
- */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - /* Get the next period */ - next = tick_init_jiffy_update(); - - for (;;) { - ts->sched_timer.expires = next; - if (!tick_program_event(next, 0)) - break; - next = ktime_add(next, tick_period); - } - local_irq_enable(); - - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", - smp_processor_id()); -} - -#else - -static inline void tick_nohz_switch_to_nohz(void) { } - -#endif /* NO_HZ */ - -/* - * High resolution timer specific code - */ -#ifdef CONFIG_HIGH_RES_TIMERS -/* - * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. - */ -static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) -{ - struct tick_sched *ts = - container_of(timer, struct tick_sched, sched_timer); - struct pt_regs *regs = get_irq_regs(); - ktime_t now = ktime_get(); - int cpu = smp_processor_id(); - -#ifdef CONFIG_NO_HZ - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; -#endif - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); - - /* - * Do not call, when we are not in irq context and have - * no valid regs pointer - */ - if (regs) { - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start of - * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. 
- */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - } - - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - - hrtimer_forward(timer, now, tick_period); - - return HRTIMER_RESTART; -} - -/** - * tick_setup_sched_timer - setup the tick emulation timer - */ -void tick_setup_sched_timer(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - ktime_t now = ktime_get(); - u64 offset; - - /* - * Emulate tick processing via per-CPU hrtimers: - */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; - - /* Get the next period (per cpu) */ - ts->sched_timer.expires = tick_init_jiffy_update(); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); - - for (;;) { - hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - now = ktime_get(); - } - -#ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) - ts->nohz_mode = NOHZ_MODE_HIGHRES; -#endif -} -#endif /* HIGH_RES_TIMERS */ - -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS -void tick_cancel_sched_timer(int cpu) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - -# ifdef CONFIG_HIGH_RES_TIMERS - if (ts->sched_timer.base) - hrtimer_cancel(&ts->sched_timer); -# endif - - ts->nohz_mode = NOHZ_MODE_INACTIVE; -} -#endif - -/** - * Async notification about clocksource changes - */ -void tick_clock_notify(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); -} - -/* - 
* Async notification about clock event changes - */ -void tick_oneshot_notify(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - set_bit(0, &ts->check_clocks); -} - -/** - * Check, if a change happened, which makes oneshot possible. - * - * Called cyclic from the hrtimer softirq (driven by the timer - * softirq) allow_nohz signals, that we can switch into low-res nohz - * mode, because high resolution timers are disabled (either compile - * or runtime). - */ -int tick_check_oneshot_change(int allow_nohz) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - if (!test_and_clear_bit(0, &ts->check_clocks)) - return 0; - - if (ts->nohz_mode != NOHZ_MODE_INACTIVE) - return 0; - - if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) - return 0; - - if (!allow_nohz) - return 1; - - tick_nohz_switch_to_nohz(); - return 0; -} -/* - * linux/kernel/time/timekeeping.c - * - * Kernel timekeeping code and accessor functions - * - * This code was moved from linux/kernel/timer.c. - * Please see that file for copyright and history logs. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. 
- * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ - -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; - -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - /* - * Use temporary variable so get_seconds() cannot catch - * an intermediate xtime_cache.tv_sec value. - * The ACCESS_ONCE() keeps the compiler from optimizing - * out the intermediate value. - */ - struct timespec ts = xtime; - timespec_add_ns(&ts, nsec); - ACCESS_ONCE(xtime_cache) = ts; -} - -struct clocksource *clock; - - -#ifdef CONFIG_GENERIC_TIME -/** - * clocksource_forward_now - update clock to the current time - * - * Forward the current clock to update its state since the last call to - * update_wall_time(). This is useful before significant clock changes, - * as it avoids having to deal with this time offset explicitly. - */ -static void clocksource_forward_now(void) -{ - cycle_t cycle_now, cycle_delta; - s64 nsec; - - cycle_now = clocksource_read(clock); - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; - - nsec = cyc2ns(clock, cycle_delta); - timespec_add_ns(&xtime, nsec); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. 
- */ -void getnstimeofday(struct timespec *ts) -{ - cycle_t cycle_now, cycle_delta; - unsigned long seq; - s64 nsecs; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -EXPORT_SYMBOL(getnstimeofday); - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - getnstimeofday(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - -EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(struct timespec *tv) -{ - struct timespec ts_delta; - unsigned long flags; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - clocksource_forward_now(); - - ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - - xtime = *tv; - - update_xtime_cache(0); - - clock->error = 0; - ntp_clear(); - - update_vsyscall(&xtime, clock); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new 
clocksource - */ -static void change_clocksource(void) -{ - struct clocksource *new; - - new = clocksource_get_next(); - - if (clock == new) - return; - - clocksource_forward_now(); - - clock = new; - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(new); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - - tick_clock_notify(); - - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ -} -#else -static inline void clocksource_forward_now(void) { } -static inline void change_clocksource(void) { } -#endif - -/** - * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres - */ -int timekeeping_valid_for_hres(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; -} - -/** - * read_persistent_clock - Return time in seconds from the persistent clock. - * - * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. 
- */ -unsigned long __attribute__((weak)) read_persistent_clock(void) -{ - return 0; -} - -/* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - unsigned long flags; - unsigned long sec = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_init(); - - clock = clocksource_get_next(); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - update_xtime_cache(0); - total_sleep_time = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); -} - -/* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; - -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. 
- */ -static int timekeeping_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long now = read_persistent_clock(); - - clocksource_resume(); - - write_seqlock_irqsave(&xtime_lock, flags); - - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; - } - update_xtime_cache(0); - /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - - touch_softlockup_watchdog(); - - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ - hres_timers_resume(); - - return 0; -} - -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - timekeeping_suspend_time = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); - timekeeping_suspended = 1; - write_sequnlock_irqrestore(&xtime_lock, flags); - - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) -{ - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(timekeeping_init_device); - -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. 
- */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adjusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. 
- */ -static void clocksource_adjust(s64 offset) -{ - s64 error, interval = clock->cycle_interval; - int adj; - - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); - if (error > interval) { - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = clocksource_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = clocksource_bigadjust(error, &interval, &offset); - } else - return; - - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); -} - -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. - */ -void update_wall_time(void) -{ - cycle_t offset; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - -#ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; -#else - offset = clock->cycle_interval; -#endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. 
- */ - while (offset >= clock->cycle_interval) { - /* accumulate one interval */ - offset -= clock->cycle_interval; - clock->cycle_last += clock->cycle_interval; - - clock->xtime_nsec += clock->xtime_interval; - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; - xtime.tv_sec++; - second_overflow(); - } - - /* accumulate error between NTP and clock interval */ - clock->error += tick_length; - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); - } - - /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); - - /* - * Since in the loop above, we accumulate any amount of time - * in xtime_nsec over a second into xtime.tv_sec, its possible for - * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in clocksource_adjust(), - * its possible the required corrective factor to xtime_nsec could - * cause it to underflow. - * - * Now, we cannot simply roll the accumulated second back, since - * the NTP subsystem has been notified via second_overflow. So - * instead we push xtime_nsec forward by the amount we underflowed, - * and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. - */ - if (unlikely((s64)clock->xtime_nsec < 0)) { - s64 neg = -(s64)clock->xtime_nsec; - clock->xtime_nsec = 0; - clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); - } - - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - - update_xtime_cache(cyc2ns(clock, offset)); - - /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); -} - -/** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. 
- * - * This is based on the wall_to_monotonic offset and the total suspend - * time. Calls to settimeofday will affect the value returned (which - * basically means that however wrong your real time clock is at boot time, - * you get the right time here). - */ -void getboottime(struct timespec *ts) -{ - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); -} - -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. - * @ts: pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ - ts->tv_sec += total_sleep_time; -} - -unsigned long get_seconds(void) -{ - return xtime_cache.tv_sec; -} -EXPORT_SYMBOL(get_seconds); - - -struct timespec current_kernel_time(void) -{ - struct timespec now; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; - } while (read_seqretry(&xtime_lock, seq)); - - return now; -} -EXPORT_SYMBOL(current_kernel_time); -/* - * kernel/time/timer_list.c - * - * List pending timers - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); - -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - -/* - * This allows printing both to /proc/timer_list and - * to the console (on SysRq-Q): - */ -#define SEQ_printf(m, x...) 
\ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) - -static void print_name_offset(struct seq_file *m, void *sym) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%p>", sym); - else - SEQ_printf(m, "%s", symname); -} - -static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) -{ -#ifdef CONFIG_TIMER_STATS - char tmp[TASK_COMM_LEN + 1]; -#endif - SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); - SEQ_printf(m, ", "); - print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); -#ifdef CONFIG_TIMER_STATS - SEQ_printf(m, ", "); - print_name_offset(m, timer->start_site); - memcpy(tmp, timer->start_comm, TASK_COMM_LEN); - tmp[TASK_COMM_LEN] = 0; - SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); -#endif - SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", - (unsigned long long)ktime_to_ns(timer->expires), - (long long)(ktime_to_ns(timer->expires) - now)); -} - -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) -{ - struct hrtimer *timer, tmp; - unsigned long next = 0, i; - struct rb_node *curr; - unsigned long flags; - -next_one: - i = 0; - spin_lock_irqsave(&base->cpu_base->lock, flags); - - curr = base->first; - /* - * Crude but we have to do this O(N*N) thing, because - * we have to unlock the base when printing: - */ - while (curr && i < next) { - curr = rb_next(curr); - i++; - } - - if (curr) { - - timer = rb_entry(curr, struct hrtimer, node); - tmp = *timer; - spin_unlock_irqrestore(&base->cpu_base->lock, flags); - - print_timer(m, &tmp, i, now); - next++; - goto next_one; - } - spin_unlock_irqrestore(&base->cpu_base->lock, flags); -} - -static void -print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) -{ - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned 
long long)ktime_to_ns(base->resolution)); - SEQ_printf(m, " .get_time: "); - print_name_offset(m, base->get_time); - SEQ_printf(m, "\n"); -#ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Lu nsecs\n", - (unsigned long long) ktime_to_ns(base->offset)); -#endif - SEQ_printf(m, "active timers:\n"); - print_active_timers(m, base, now); -} - -static void print_cpu(struct seq_file *m, int cpu, u64 now) -{ - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - - SEQ_printf(m, "\n"); - SEQ_printf(m, "cpu: %d\n", cpu); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - SEQ_printf(m, " clock %d:\n", i); - print_base(m, cpu_base->clock_base + i, now); - } -#define P(x) \ - SEQ_printf(m, " .%-15s: %Lu\n", #x, \ - (unsigned long long)(cpu_base->x)) -#define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ - (unsigned long long)(ktime_to_ns(cpu_base->x))) - -#ifdef CONFIG_HIGH_RES_TIMERS - P_ns(expires_next); - P(hres_active); - P(nr_events); -#endif -#undef P -#undef P_ns - -#ifdef CONFIG_TICK_ONESHOT -# define P(x) \ - SEQ_printf(m, " .%-15s: %Lu\n", #x, \ - (unsigned long long)(ts->x)) -# define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ - (unsigned long long)(ktime_to_ns(ts->x))) - { - struct tick_sched *ts = tick_get_tick_sched(cpu); - P(nohz_mode); - P_ns(idle_tick); - P(tick_stopped); - P(idle_jiffies); - P(idle_calls); - P(idle_sleeps); - P_ns(idle_entrytime); - P_ns(idle_waketime); - P_ns(idle_exittime); - P_ns(idle_sleeptime); - P(last_jiffies); - P(next_jiffies); - P_ns(idle_expires); - SEQ_printf(m, "jiffies: %Lu\n", - (unsigned long long)jiffies); - } -#endif - -#undef P -#undef P_ns -} - -#ifdef CONFIG_GENERIC_CLOCKEVENTS -static void -print_tickdevice(struct seq_file *m, struct tick_device *td) -{ - struct clock_event_device *dev = td->evtdev; - - SEQ_printf(m, "\n"); - SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); - - SEQ_printf(m, "Clock Event Device: "); - if (!dev) { - SEQ_printf(m, "\n"); - return; - 
} - SEQ_printf(m, "%s\n", dev->name); - SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); - SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); - SEQ_printf(m, " mult: %lu\n", dev->mult); - SEQ_printf(m, " shift: %d\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); - SEQ_printf(m, " next_event: %Ld nsecs\n", - (unsigned long long) ktime_to_ns(dev->next_event)); - - SEQ_printf(m, " set_next_event: "); - print_name_offset(m, dev->set_next_event); - SEQ_printf(m, "\n"); - - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); - SEQ_printf(m, "\n"); - - SEQ_printf(m, " event_handler: "); - print_name_offset(m, dev->event_handler); - SEQ_printf(m, "\n"); -} - -static void timer_list_show_tickdevices(struct seq_file *m) -{ - int cpu; - -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); - SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - tick_get_broadcast_mask()->bits[0]); -#ifdef CONFIG_TICK_ONESHOT - SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - tick_get_broadcast_oneshot_mask()->bits[0]); -#endif - SEQ_printf(m, "\n"); -#endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); - SEQ_printf(m, "\n"); -} -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } -#endif - -static int timer_list_show(struct seq_file *m, void *v) -{ - u64 now = ktime_to_ns(ktime_get()); - int cpu; - - SEQ_printf(m, "Timer List Version: v0.3\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); - - for_each_online_cpu(cpu) - print_cpu(m, cpu, now); - - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); - - return 0; -} - -void sysrq_timer_list_show(void) -{ - timer_list_show(NULL, NULL); -} - -static int timer_list_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, timer_list_show, NULL); -} - -static struct file_operations timer_list_fops = { - 
.open = timer_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init init_timer_list_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_timer_list_procfs); -/* - * kernel/time/timer_stats.c - * - * Collect timer usage statistics. - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006 Timesys Corp., Thomas Gleixner - * - * timer_stats is based on timer_top, a similar functionality which was part of - * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the - * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based - * on dynamic allocation of the statistics entries and linear search based - * lookup combined with a global lock, rather than the static array, hash - * and per-CPU locking which is used by timer_stats. It was written for the - * pre hrtimer kernel code and therefore did not take hrtimers into account. - * Nevertheless it provided the base for the timer_stats implementation and - * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks - * for this effort. - * - * timer_top.c is - * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus - * Written by Daniel Petrini - * timer_top.c was released under the GNU General Public License version 2 - * - * We export the addresses and counting of timer functions being called, - * the pid and cmdline from the owner process if applicable. - * - * Start/stop data collection: - * # echo [1|0] >/proc/timer_stats - * - * Display the information collected so far: - * # cat /proc/timer_stats - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * This is our basic unit of interest: a timer expiry event identified - * by the timer, its start/expire functions and the PID of the task that - * started the timer. We count the number of times an event happens: - */ -struct entry { - /* - * Hash list: - */ - struct entry *next; - - /* - * Hash keys: - */ - void *timer; - void *start_func; - void *expire_func; - pid_t pid; - - /* - * Number of timeout events: - */ - unsigned long count; - unsigned int timer_flag; - - /* - * We save the command-line string to preserve - * this information past task exit: - */ - char comm[TASK_COMM_LEN + 1]; - -} ____cacheline_aligned_in_smp; - -/* - * Spinlock protecting the tables - not taken during lookup: - */ -static DEFINE_SPINLOCK(table_lock); - -/* - * Per-CPU lookup locks for fast hash lookup: - */ -static DEFINE_PER_CPU(spinlock_t, lookup_lock); - -/* - * Mutex to serialize state changes with show-stats activities: - */ -static DEFINE_MUTEX(show_mutex); - -/* - * Collection status, active/inactive: - */ -static int __read_mostly active; - -/* - * Beginning/end timestamps of measurement: - */ -static ktime_t time_start, time_stop; - -/* - * tstat entry structs only get allocated while collection is - * active and never freed during that time - this simplifies - * things quite a bit. - * - * They get freed when a new collection period is started. 
- */ -#define MAX_ENTRIES_BITS 10 -#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) - -static unsigned long nr_entries; -static struct entry entries[MAX_ENTRIES]; - -static atomic_t overflow_count; - -/* - * The entries are in a hash-table, for fast lookup: - */ -#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) -#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) -#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) - -#define __tstat_hashfn(entry) \ - (((unsigned long)(entry)->timer ^ \ - (unsigned long)(entry)->start_func ^ \ - (unsigned long)(entry)->expire_func ^ \ - (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) - -#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) - -static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; - -static void reset_entries(void) -{ - nr_entries = 0; - memset(entries, 0, sizeof(entries)); - memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); - atomic_set(&overflow_count, 0); -} - -static struct entry *alloc_entry(void) -{ - if (nr_entries >= MAX_ENTRIES) - return NULL; - - return entries + nr_entries++; -} - -static int match_entries(struct entry *entry1, struct entry *entry2) -{ - return entry1->timer == entry2->timer && - entry1->start_func == entry2->start_func && - entry1->expire_func == entry2->expire_func && - entry1->pid == entry2->pid; -} - -/* - * Look up whether an entry matching this item is present - * in the hash already. 
Must be called with irqs off and the - * lookup lock held: - */ -static struct entry *tstat_lookup(struct entry *entry, char *comm) -{ - struct entry **head, *curr, *prev; - - head = tstat_hashentry(entry); - curr = *head; - - /* - * The fastpath is when the entry is already hashed, - * we do this with the lookup lock held, but with the - * table lock not held: - */ - while (curr) { - if (match_entries(curr, entry)) - return curr; - - curr = curr->next; - } - /* - * Slowpath: allocate, set up and link a new hash entry: - */ - prev = NULL; - curr = *head; - - spin_lock(&table_lock); - /* - * Make sure we have not raced with another CPU: - */ - while (curr) { - if (match_entries(curr, entry)) - goto out_unlock; - - prev = curr; - curr = curr->next; - } - - curr = alloc_entry(); - if (curr) { - *curr = *entry; - curr->count = 0; - curr->next = NULL; - memcpy(curr->comm, comm, TASK_COMM_LEN); - - smp_mb(); /* Ensure that curr is initialized before insert */ - - if (prev) - prev->next = curr; - else - *head = curr; - } - out_unlock: - spin_unlock(&table_lock); - - return curr; -} - -/** - * timer_stats_update_stats - Update the statistics for a timer. - * @timer: pointer to either a timer_list or a hrtimer - * @pid: the pid of the task which set up the timer - * @startf: pointer to the function which did the timer setup - * @timerf: pointer to the timer callback function of the timer - * @comm: name of the process which set up the timer - * - * When the timer is already registered, then the event counter is - * incremented. Otherwise the timer is registered in a free slot. 
- */ -void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag) -{ - /* - * It doesnt matter which lock we take: - */ - spinlock_t *lock; - struct entry *entry, input; - unsigned long flags; - - if (likely(!active)) - return; - - lock = &per_cpu(lookup_lock, raw_smp_processor_id()); - - input.timer = timer; - input.start_func = startf; - input.expire_func = timerf; - input.pid = pid; - input.timer_flag = timer_flag; - - spin_lock_irqsave(lock, flags); - if (!active) - goto out_unlock; - - entry = tstat_lookup(&input, comm); - if (likely(entry)) - entry->count++; - else - atomic_inc(&overflow_count); - - out_unlock: - spin_unlock_irqrestore(lock, flags); -} - -static void print_name_offset(struct seq_file *m, unsigned long addr) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name(addr, symname) < 0) - seq_printf(m, "<%p>", (void *)addr); - else - seq_printf(m, "%s", symname); -} - -static int tstats_show(struct seq_file *m, void *v) -{ - struct timespec period; - struct entry *entry; - unsigned long ms; - long events = 0; - ktime_t time; - int i; - - mutex_lock(&show_mutex); - /* - * If still active then calculate up to now: - */ - if (active) - time_stop = ktime_get(); - - time = ktime_sub(time_stop, time_start); - - period = ktime_to_timespec(time); - ms = period.tv_nsec / 1000000; - - seq_puts(m, "Timer Stats Version: v0.2\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); - if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", - atomic_read(&overflow_count)); - - for (i = 0; i < nr_entries; i++) { - entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { - seq_printf(m, "%4luD, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } else { - seq_printf(m, " %4lu, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } - - print_name_offset(m, (unsigned long)entry->start_func); - seq_puts(m, " ("); - 
print_name_offset(m, (unsigned long)entry->expire_func); - seq_puts(m, ")\n"); - - events += entry->count; - } - - ms += period.tv_sec * 1000; - if (!ms) - ms = 1; - - if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", - events, events * 1000 / ms, - (events * 1000000 / ms) % 1000); - else - seq_printf(m, "%ld total events\n", events); - - mutex_unlock(&show_mutex); - - return 0; -} - -/* - * After a state change, make sure all concurrent lookup/update - * activities have stopped: - */ -static void sync_access(void) -{ - unsigned long flags; - int cpu; - - for_each_online_cpu(cpu) { - spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); - /* nothing */ - spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); - } -} - -static ssize_t tstats_write(struct file *file, const char __user *buf, - size_t count, loff_t *offs) -{ - char ctl[2]; - - if (count != 2 || *offs) - return -EINVAL; - - if (copy_from_user(ctl, buf, count)) - return -EFAULT; - - mutex_lock(&show_mutex); - switch (ctl[0]) { - case '0': - if (active) { - active = 0; - time_stop = ktime_get(); - sync_access(); - } - break; - case '1': - if (!active) { - reset_entries(); - time_start = ktime_get(); - smp_mb(); - active = 1; - } - break; - default: - count = -EINVAL; - } - mutex_unlock(&show_mutex); - - return count; -} - -static int tstats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, tstats_show, NULL); -} - -static struct file_operations tstats_fops = { - .open = tstats_open, - .read = seq_read, - .write = tstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -void __init init_timer_stats(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(lookup_lock, cpu)); -} - -static int __init init_tstats_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_tstats_procfs); -/* - * 
linux/kernel/time.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c - * - * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() - * 1993-10-08 Torsten Duwe - * adjtime interface update and CMOS clock write code - * 1995-08-13 Torsten Duwe - * kernel PLL updated to 1994-12-13 specs (rfc-1589) - * 1999-01-16 Ulrich Windl - * Introduced error checking for many cases in adjtimex(). - * Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) - * (Even though the technical memorandum forbids it) - * 2004-07-14 Christoph Lameter - * Added getnstimeofday to allow the posix timer functions to return - * with nanosecond accuracy - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "timeconst.h" - -/* - * The timezone where the local system is located. Used as a default by some - * programs who obtain this value by using gettimeofday. - */ -struct timezone sys_tz; - -EXPORT_SYMBOL(sys_tz); - -#ifdef __ARCH_WANT_SYS_TIME - -/* - * sys_time() can be implemented in user-level using - * sys_gettimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) -{ - time_t i = get_seconds(); - - if (tloc) { - if (put_user(i,tloc)) - i = -EFAULT; - } - return i; -} - -/* - * sys_stime() can be implemented in user-level using - * sys_settimeofday(). Is this for backwards compatibility? 
If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ - -SYSCALL_DEFINE1(stime, time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_SYS_TIME */ - -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) - return -EFAULT; - } - if (unlikely(tz != NULL)) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - write_seqlock_irq(&xtime_lock); - wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); -} - -/* - * In case for some reason the CMOS clock has not already been running - * in UTC, but in some local time: The first time we set the timezone, - * we will warp the clock so that it is ticking UTC time instead of - * local time. 
Presumably, if someone is setting the timezone then we - * are running in an environment where the programs understand about - * timezones. This should be done at boot time in the /etc/rc script, - * as soon as possible, so that the clock can be set right. Otherwise, - * various programs will get confused when the clock gets warped. - */ - -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) -{ - static int firsttime = 1; - int error = 0; - - if (tv && !timespec_valid(tv)) - return -EINVAL; - - error = security_settime(tv, tz); - if (error) - return error; - - if (tz) { - /* SMP safe, global irq locking makes it work. */ - sys_tz = *tz; - update_vsyscall_tz(); - if (firsttime) { - firsttime = 0; - if (!tv) - warp_clock(); - } - } - if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. - */ - return do_settimeofday(tv); - } - return 0; -} - -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timeval user_tv; - struct timespec new_ts; - struct timezone new_tz; - - if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); -} - -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) -{ - struct timex txc; /* Local copy of parameter */ - int ret; - - /* Copy the user data space into the kernel copy - * structure. But bear in mind that the structures - * may change - */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) - return -EFAULT; - ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; -} - -/** - * current_fs_time - Return FS time - * @sb: Superblock. 
- * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - -/* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int inline jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; -# else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int inline jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else - return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. 
- */ - if (gran <= jiffies_to_usecs(1) * 1000) { - /* nothing */ - } else if (gran == 1000000000) { - t.tv_nsec = 0; - } else { - t.tv_nsec -= t.tv_nsec % gran; - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. - * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 - * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. - * - * [For the Julian calendar (which was used in Russia before 1917, - * Britain & colonies before 1752, anywhere else before 1582, - * and is still in use by some communities) leave out the - * -year/100+year/400 terms, and add 10.] - * - * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! 
(However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) - */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) -{ - unsigned int mon = mon0, year = year0; - - /* 1..12 -> 11,12,1..10 */ - if (0 >= (int) (mon -= 2)) { - mon += 12; /* Puts Feb last since it has leap day */ - year -= 1; - } - - return ((((unsigned long) - (year/4 - year/100 + year/400 + 367*mon/12 + day) + - year*365 - 719499 - )*24 + hour /* now have hours */ - )*60 + min /* now have minutes */ - )*60 + sec; /* finally seconds */ -} - -EXPORT_SYMBOL(mktime); - -/** - * set_normalized_timespec - set timespec sec and nsec parts and normalize - * - * @ts: pointer to timespec variable to be set - * @sec: seconds to set - * @nsec: nanoseconds to set - * - * Set seconds and nanoseconds field of a timespec variable and - * normalize to the timespec storage format - * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC - * For negative values only the tv_sec field is negative ! - */ -void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) -{ - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} -EXPORT_SYMBOL(set_normalized_timespec); - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. 
- */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - -/* - * When we convert to jiffies then we interpret incoming values - * the following way: - * - * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) - * - * - 'too large' values [that would result in larger than - * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. - * - * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. - */ -unsigned long msecs_to_jiffies(const unsigned int m) -{ - /* - * Negative value, means infinite timeout: - */ - if ((int)m < 0) - return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. 
- * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(msecs_to_jiffies); - -unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(usecs_to_jiffies); - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. - * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. 
- */ -unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} -EXPORT_SYMBOL(timespec_to_jiffies); - -void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_nsec = rem; -} -EXPORT_SYMBOL(jiffies_to_timespec); - -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. - */ -unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} -EXPORT_SYMBOL(timeval_to_jiffies); - -void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. 
- */ - u32 rem; - - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} -EXPORT_SYMBOL(jiffies_to_timeval); - -/* - * Convert jiffies/jiffies_64 to clock_t and back. - */ -clock_t jiffies_to_clock_t(long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - return x * (USER_HZ / HZ); -# else - return x / (HZ / USER_HZ); -# endif -#else - return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); -#endif -} -EXPORT_SYMBOL(jiffies_to_clock_t); - -unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - return div_u64((u64)x * HZ, USER_HZ); -#endif -} -EXPORT_SYMBOL(clock_t_to_jiffies); - -u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - x = div_u64(x * USER_HZ, HZ); -# elif HZ > USER_HZ - x = div_u64(x, HZ / USER_HZ); -# else - /* Nothing to do */ -# endif -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. - */ - x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} -EXPORT_SYMBOL(jiffies_64_to_clock_t); - -u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - return div_u64(x, NSEC_PER_SEC / USER_HZ); -#elif (USER_HZ % 512) == 0 - return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
- */ - return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); -#endif -} - -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) -{ - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; -} -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); -/* - * linux/kernel/timer.c - * - * Kernel internal timers, basic process system calls - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. - * - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - * Copyright (C) 1998 Andrea Arcangeli - * 1999-03-10 Improved NTP compatibility by Ulrich Windl - * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love - * 2000-10-05 Implemented scalable SMP per-CPU timer handling. - * Copyright (C) 2000, 2001, 2002 Ingo Molnar - * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -/* - * per-CPU timer vector definitions: - */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -struct tvec { - struct list_head vec[TVN_SIZE]; -}; - -struct tvec_root { - struct list_head vec[TVR_SIZE]; -}; - -struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long timer_jiffies; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; -} ____cacheline_aligned; - -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; - -/* - * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable - */ -#define TBASE_DEFERRABLE_FLAG (0x1) - -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); -} - -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) -{ - return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); -} - -static inline void timer_set_deferrable(struct timer_list *timer) -{ - timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); -} - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) -{ - timer->base = (struct tvec_base *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); -} - -/** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. 
This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - int rem; - unsigned long original = j; - - /* - * We don't want all cpus firing their timers at once hitting the - * same lock or cachelines, so we skew each extra cpu with an extra - * 3 jiffies. This 3 jiffies came originally from the mm/ code which - * already did this. - * The skew is done by adding 3*cpunr, then round, then subtract this - * extra offset again. - */ - j += cpu * 3; - - rem = j % HZ; - - /* - * If the target jiffie is just after a whole second (which can happen - * due to delays of the timer irq, long irq off times etc etc) then - * we should round down to the whole second, not up. Use 1/4th second - * as cutoff for this rounding as an extreme upper bound for this. 
- */ - if (rem < HZ/4) /* round down */ - j = j - rem; - else /* round up */ - j = j - rem + HZ; - - /* now that we have rounded, subtract the extra skew again */ - j -= cpu * 3; - - if (j <= jiffies) /* rounding ate our timeout entirely; */ - return original; - return j; -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** - * __round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies_relative(unsigned long j, int cpu) -{ - /* - * In theory the following code can skip a jiffy in case jiffies - * increments right between the addition and the later subtraction. - * However since the entire point of this function is to use approximate - * timeouts, it's entirely ok to not handle that. - */ - return __round_jiffies(j + jiffies, cpu) - jiffies; -} -EXPORT_SYMBOL_GPL(__round_jiffies_relative); - -/** - * round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. 
This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long round_jiffies(unsigned long j) -{ - return __round_jiffies(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies); - -/** - * round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The return value is the rounded version of the @j parameter. 
- */ -unsigned long round_jiffies_relative(unsigned long j) -{ - return __round_jiffies_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_relative); - - -static inline void set_running_timer(struct tvec_base *base, - struct timer_list *timer) -{ -#ifdef CONFIG_SMP - base->running_timer = timer; -#endif -} - -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) -{ - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); - } else { - int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: - */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; - expires = idx + base->timer_jiffies; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); -} - -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - unsigned int flag = 0; - - if 
(unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; - - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr timer_debug_descr; - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_init(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return 0; - } else { - WARN_ON_ONCE(1); - } - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_free(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, -}; - -static inline void debug_timer_init(struct timer_list *timer) -{ - debug_object_init(timer, &timer_debug_descr); -} - -static inline void debug_timer_activate(struct timer_list *timer) -{ - debug_object_activate(timer, &timer_debug_descr); -} - -static inline void debug_timer_deactivate(struct timer_list *timer) -{ - debug_object_deactivate(timer, &timer_debug_descr); -} - -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - -static void __init_timer(struct timer_list *timer); - -void init_timer_on_stack(struct timer_list *timer) -{ - debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer); -} -EXPORT_SYMBOL_GPL(init_timer_on_stack); - -void destroy_timer_on_stack(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); - -#else -static inline void debug_timer_init(struct timer_list *timer) { } -static inline void debug_timer_activate(struct timer_list *timer) { } -static inline void debug_timer_deactivate(struct timer_list *timer) { } -#endif - -static void 
__init_timer(struct timer_list *timer) -{ - timer->entry.next = NULL; - timer->base = __raw_get_cpu_var(tvec_bases); -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif -} - -/** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. - */ -void init_timer(struct timer_list *timer) -{ - debug_timer_init(timer); - __init_timer(timer); -} -EXPORT_SYMBOL(init_timer); - -void init_timer_deferrable(struct timer_list *timer) -{ - init_timer(timer); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL(init_timer_deferrable); - -static inline void detach_timer(struct timer_list *timer, - int clear_pending) -{ - struct list_head *entry = &timer->entry; - - debug_timer_deactivate(timer); - - __list_del(entry->prev, entry->next); - if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; -} - -/* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. - * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. - * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. 
- */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) - __acquires(timer->base->lock) -{ - struct tvec_base *base; - - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) - return base; - /* The timer has migrated to another CPU */ - spin_unlock_irqrestore(&base->lock, *flags); - } - cpu_relax(); - } -} - -int __mod_timer(struct timer_list *timer, unsigned long expires) -{ - struct tvec_base *base, *new_base; - unsigned long flags; - int ret = 0; - - timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); - - if (timer_pending(timer)) { - detach_timer(timer, 0); - ret = 1; - } - - debug_timer_activate(timer); - - new_base = __get_cpu_var(tvec_bases); - - if (base != new_base) { - /* - * We are trying to schedule the timer on the local CPU. - * However we can't change timer's base while it is running, - * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. - */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } - } - - timer->expires = expires; - internal_add_timer(base, timer); - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} - -EXPORT_SYMBOL(__mod_timer); - -/** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on - * - * This is not very scalable on SMP. Double adds are not possible. 
- */ -void add_timer_on(struct timer_list *timer, int cpu) -{ - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_timer_activate(timer); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. - */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); -} - -/** - * mod_timer - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * - * mod_timer(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - * - * Note that if there are multiple unserialized concurrent users of the - * same timer, then mod_timer() is the only safe way to modify the timeout, - * since add_timer() cannot modify an already running timer. - * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) 
- */ -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires); -} - -EXPORT_SYMBOL(mod_timer); - -/** - * del_timer - deactive a timer. - * @timer: the timer to be deactivated - * - * del_timer() deactivates a timer - this works on both active and inactive - * timers. - * - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) - */ -int del_timer(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = 0; - - timer_stats_timer_clear_start_info(timer); - if (timer_pending(timer)) { - base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - ret = 1; - } - spin_unlock_irqrestore(&base->lock, flags); - } - - return ret; -} - -EXPORT_SYMBOL(del_timer); - -#ifdef CONFIG_SMP -/** - * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del - * - * This function tries to deactivate a timer. Upon successful (ret >= 0) - * exit the timer is not queued and the handler is not running on any CPU. - * - * It must not be called from interrupt contexts. 
- */ -int try_to_del_timer_sync(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = -1; - - base = lock_timer_base(timer, &flags); - - if (base->running_timer == timer) - goto out; - - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - ret = 1; - } -out: - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} - -EXPORT_SYMBOL(try_to_del_timer_sync); - -/** - * del_timer_sync - deactivate a timer and wait for the handler to finish. - * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. - * - * Synchronization rules: Callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. The timer's handler must not call - * add_timer_on(). Upon exit the timer is not queued and the handler is - * not running on any CPU. - * - * The function returns whether it has deactivated a pending timer or not. - */ -int del_timer_sync(struct timer_list *timer) -{ - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } -} - -EXPORT_SYMBOL(del_timer_sync); -#endif - -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; - - list_replace_init(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. 
- */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); - } - - return index; -} - -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. - */ -static inline void __run_timers(struct tvec_base *base) -{ - struct timer_list *timer; - - spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; - - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); - while (!list_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - - timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; - - timer_stats_account_timer(timer); - - set_running_timer(base, timer); - detach_timer(timer, 1); - spin_unlock_irq(&base->lock); - { - int preempt_count = preempt_count(); - fn(data); - if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); - } - } - spin_lock_irq(&base->lock); - } - } - set_running_timer(base, NULL); - spin_unlock_irq(&base->lock); -} - -#ifdef CONFIG_NO_HZ -/* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a cpus is idle. - * This functions needs to be called disabled. 
- */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; - do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; - } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = timer_jiffies & TVN_MASK; - do { - list_for_each_entry(nte, varp->vec + slot, entry) { - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? 
*/ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; - } - return expires; -} - -/* - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) -{ - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; - - /* - * Expired timer available, let it expire in the next tick - */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); - - /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): - */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; - - /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. 
Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq - */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; -} - -/** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) - */ -unsigned long get_next_timer_interrupt(unsigned long now) -{ - struct tvec_base *base = __get_cpu_var(tvec_bases); - unsigned long expires; - - spin_lock(&base->lock); - expires = __next_timer_interrupt(base); - spin_unlock(&base->lock); - - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); -} -#endif - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy = jiffies_to_cputime(1); - - if (user_tick) { - account_user_time(p, one_jiffy); - account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); - } else { - account_system_time(p, HARDIRQ_OFFSET, one_jiffy); - account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); - } -} -#endif - -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - int cpu = smp_processor_id(); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_tick); - printk_tick(); - scheduler_tick(); - run_posix_cpu_timers(p); -} - -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. 
- * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - -/* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - struct tvec_base *base = __get_cpu_var(tvec_bases); - - hrtimer_run_pending(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* - * Called by the local, per-CPU timer interrupt on SMP. - */ -void run_local_timers(void) -{ - hrtimer_run_queues(); - raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); -} - -/* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_times(ticks); -} - -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. 
- */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - -/** - * sys_getpid - return the thread group id of the current process - * - * Note, despite the name, this returns the tgid not the pid. The tgid and - * the pid are identical unless CLONE_THREAD was specified on clone() in - * which case the tgid is the same in all threads of the same group. - * - * This is SMP safe as current->tgid does not change. - */ -SYSCALL_DEFINE0(getpid) -{ - return task_tgid_vnr(current); -} - -/* - * Accessing ->real_parent is not SMP-safe, it could - * change from under us. However, we can use a stale - * value of ->real_parent under rcu_read_lock(), see - * release_task()->call_rcu(delayed_put_task_struct). - */ -SYSCALL_DEFINE0(getppid) -{ - int pid; - - rcu_read_lock(); - pid = task_tgid_vnr(current->real_parent); - rcu_read_unlock(); - - return pid; -} - -SYSCALL_DEFINE0(getuid) -{ - /* Only we change this so SMP safe */ - return current->uid; -} - -SYSCALL_DEFINE0(geteuid) -{ - /* Only we change this so SMP safe */ - return current->euid; -} - -SYSCALL_DEFINE0(getgid) -{ - /* Only we change this so SMP safe */ - return current->gid; -} - -SYSCALL_DEFINE0(getegid) -{ - /* Only we change this so SMP safe */ - return current->egid; -} - -#endif - -static void process_timeout(unsigned long __data) -{ - wake_up_process((struct task_struct *)__data); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. 
The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * In all cases the return value is guaranteed to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - current->state = TASK_RUNNING; - goto out; - } - } - - expire = timeout + jiffies; - - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); - schedule(); - del_singleshot_timer_sync(&timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 
0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. - */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* Thread ID - the internal kernel "pid" */ -SYSCALL_DEFINE0(gettid) -{ - return task_pid_vnr(current); -} - -/** - * do_sysinfo - fill in sysinfo struct - * @info: pointer to buffer to fill - */ -int do_sysinfo(struct sysinfo *info) -{ - unsigned long mem_total, sav_total; - unsigned int mem_unit, bitcount; - unsigned long seq; - - memset(info, 0, sizeof(struct sysinfo)); - - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); - - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); - - si_meminfo(info); - si_swapinfo(info); - - /* - * If the sum of all the available memory (i.e. ram + swap) - * is less than can be stored in a 32 bit unsigned long then - * we can be binary compatible with 2.2.x kernels. If not, - * well, in that case 2.2.x was broken anyways... - * - * -Erik Andersen - */ - - mem_total = info->totalram + info->totalswap; - if (mem_total < info->totalram || mem_total < info->totalswap) - goto out; - bitcount = 0; - mem_unit = info->mem_unit; - while (mem_unit > 1) { - bitcount++; - mem_unit >>= 1; - sav_total = mem_total; - mem_total <<= 1; - if (mem_total < sav_total) - goto out; - } - - /* - * If mem_total did not overflow, multiply all memory values by - * info->mem_unit and set it to 1. This leaves things compatible - * with 2.2.x, and also retains compatibility with earlier 2.4.x - * kernels... 
- */ - - info->mem_unit = 1; - info->totalram <<= bitcount; - info->freeram <<= bitcount; - info->sharedram <<= bitcount; - info->bufferram <<= bitcount; - info->totalswap <<= bitcount; - info->freeswap <<= bitcount; - info->totalhigh <<= bitcount; - info->freehigh <<= bitcount; - -out: - return 0; -} - -SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) -{ - struct sysinfo val; - - do_sysinfo(&val); - - if (copy_to_user(info, &val, sizeof(struct sysinfo))) - return -EFAULT; - - return 0; -} - -static int __cpuinit init_timers_cpu(int cpu) -{ - int j; - struct tvec_base *base; - static char __cpuinitdata tvec_base_done[NR_CPUS]; - - if (!tvec_base_done[cpu]) { - static char boot_done; - - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; - } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the memory allocators are not - * initialised either. 
- */ - boot_done = 1; - base = &boot_tvec_bases; - } - tvec_base_done[cpu] = 1; - } else { - base = per_cpu(tvec_bases, cpu); - } - - spin_lock_init(&base->lock); - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) -{ - struct timer_list *timer; - - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); - timer_set_base(timer, new_base); - internal_add_timer(new_base, timer); - } -} - -static void __cpuinit migrate_timers(int cpu) -{ - struct tvec_base *old_base; - struct tvec_base *new_base; - int i; - - BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); - put_cpu_var(tvec_bases); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (init_timers_cpu(cpu) < 0) - return NOTIFY_BAD; - break; -#ifdef CONFIG_HOTPLUG_CPU - case 
CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers(cpu); - break; -#endif - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata timers_nb = { - .notifier_call = timer_cpu_notify, -}; - - -void __init init_timers(void) -{ - int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - - init_timer_stats(); - - BUG_ON(err == NOTIFY_BAD); - register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq); -} - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); -/* - * Infrastructure for profiling code inserted by 'gcc -pg'. - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2004-2008 Ingo Molnar - * - * Originally ported from the -rt patch by: - * Copyright (C) 2007 Arnaldo Carvalho de Melo - * - * Based on code in the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - -/* ftrace_enabled is a method to turn ftrace on or off */ -int ftrace_enabled __read_mostly; -static int last_ftrace_enabled; - -/* - * ftrace_disabled is set when an anomaly is discovered. 
- * ftrace_disabled is much stronger than ftrace_enabled. - */ -static int ftrace_disabled __read_mostly; - -static DEFINE_SPINLOCK(ftrace_lock); -static DEFINE_MUTEX(ftrace_sysctl_lock); - -static struct ftrace_ops ftrace_list_end __read_mostly = -{ - .func = ftrace_stub, -}; - -static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; -ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; - -static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) -{ - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! */ - read_barrier_depends(); - - while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); - op->func(ip, parent_ip); - op = op->next; - }; -} - -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; -} - -static int __register_ftrace_function(struct ftrace_ops *ops) -{ - /* Should never be called by interrupts */ - spin_lock(&ftrace_lock); - - ops->next = ftrace_list; - /* - * We are entering ops into the ftrace_list but another - * CPU might be walking that list. We need to make sure - * the ops->next pointer is valid before another CPU sees - * the ops pointer included into the ftrace_list. - */ - smp_wmb(); - ftrace_list = ops; - - if (ftrace_enabled) { - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; - } - - spin_unlock(&ftrace_lock); - - return 0; -} - -static int __unregister_ftrace_function(struct ftrace_ops *ops) -{ - struct ftrace_ops **p; - int ret = 0; - - spin_lock(&ftrace_lock); - - /* - * If we are removing the last function, then simply point - * to the ftrace_stub. 
- */ - if (ftrace_list == ops && ops->next == &ftrace_list_end) { - ftrace_trace_function = ftrace_stub; - ftrace_list = &ftrace_list_end; - goto out; - } - - for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) - if (*p == ops) - break; - - if (*p != ops) { - ret = -1; - goto out; - } - - *p = (*p)->next; - - if (ftrace_enabled) { - /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; - } - - out: - spin_unlock(&ftrace_lock); - - return ret; -} - -#ifdef CONFIG_DYNAMIC_FTRACE - -static struct task_struct *ftraced_task; - -enum { - FTRACE_ENABLE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), -}; - -static int ftrace_filtered; -static int tracing_on; -static int frozen_record_count; - -static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; - -static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); - -static DEFINE_SPINLOCK(ftrace_shutdown_lock); -static DEFINE_MUTEX(ftraced_lock); -static DEFINE_MUTEX(ftrace_regex_lock); - -struct ftrace_page { - struct ftrace_page *next; - unsigned long index; - struct dyn_ftrace records[]; -}; - -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -static int ftraced_trigger; -static int ftraced_suspend; -static int ftraced_stop; - -static int ftrace_record_suspend; - -static struct dyn_ftrace *ftrace_free_records; - - -#ifdef CONFIG_KPROBES -static inline void freeze_record(struct dyn_ftrace *rec) -{ - if (!(rec->flags & FTRACE_FL_FROZEN)) { - rec->flags |= FTRACE_FL_FROZEN; - frozen_record_count++; - } -} - -static inline void unfreeze_record(struct 
dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_FROZEN) { - rec->flags &= ~FTRACE_FL_FROZEN; - frozen_record_count--; - } -} - -static inline int record_frozen(struct dyn_ftrace *rec) -{ - return rec->flags & FTRACE_FL_FROZEN; -} -#else -# define freeze_record(rec) ({ 0; }) -# define unfreeze_record(rec) ({ 0; }) -# define record_frozen(rec) ({ 0; }) -#endif /* CONFIG_KPROBES */ - -int skip_trace(unsigned long ip) -{ - unsigned long fl; - struct dyn_ftrace *rec; - struct hlist_node *t; - struct hlist_head *head; - - if (frozen_record_count == 0) - return 0; - - head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; - hlist_for_each_entry_rcu(rec, t, head, node) { - if (rec->ip == ip) { - if (record_frozen(rec)) { - if (rec->flags & FTRACE_FL_FAILED) - return 1; - - if (!(rec->flags & FTRACE_FL_CONVERTED)) - return 1; - - if (!tracing_on || !ftrace_enabled) - return 1; - - if (ftrace_filtered) { - fl = rec->flags & (FTRACE_FL_FILTER | - FTRACE_FL_NOTRACE); - if (!fl || (fl & FTRACE_FL_NOTRACE)) - return 1; - } - } - break; - } - } - - return 0; -} - -static inline int -ftrace_ip_in_hash(unsigned long ip, unsigned long key) -{ - struct dyn_ftrace *p; - struct hlist_node *t; - int found = 0; - - hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { - if (p->ip == ip) { - found = 1; - break; - } - } - - return found; -} - -static inline void -ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) -{ - hlist_add_head_rcu(&node->node, &ftrace_hash[key]); -} - -/* called from kstop_machine */ -static inline void ftrace_del_hash(struct dyn_ftrace *node) -{ - hlist_del(&node->node); -} - -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - /* no locking, only called from kstop_machine */ - - rec->ip = (unsigned long)ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -{ - struct dyn_ftrace *rec; - - /* First check for freed records */ - if (ftrace_free_records) 
{ - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - WARN_ON_ONCE(1); - ftrace_free_records = NULL; - ftrace_disabled = 1; - ftrace_enabled = 0; - return NULL; - } - - ftrace_free_records = (void *)rec->ip; - memset(rec, 0, sizeof(*rec)); - return rec; - } - - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; - ftrace_pages = ftrace_pages->next; - } - - return &ftrace_pages->records[ftrace_pages->index++]; -} - -static void -ftrace_record_ip(unsigned long ip) -{ - struct dyn_ftrace *node; - unsigned long flags; - unsigned long key; - int resched; - int atomic; - int cpu; - - if (!ftrace_enabled || ftrace_disabled) - return; - - resched = need_resched(); - preempt_disable_notrace(); - - /* - * We simply need to protect against recursion. - * Use the the raw version of smp_processor_id and not - * __get_cpu_var which can call debug hooks that can - * cause a recursive crash here. - */ - cpu = raw_smp_processor_id(); - per_cpu(ftrace_shutdown_disable_cpu, cpu)++; - if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) - goto out; - - if (unlikely(ftrace_record_suspend)) - goto out; - - key = hash_long(ip, FTRACE_HASHBITS); - - WARN_ON_ONCE(key >= FTRACE_HASHSIZE); - - if (ftrace_ip_in_hash(ip, key)) - goto out; - - atomic = irqs_disabled(); - - spin_lock_irqsave(&ftrace_shutdown_lock, flags); - - /* This ip may have hit the hash before the lock */ - if (ftrace_ip_in_hash(ip, key)) - goto out_unlock; - - node = ftrace_alloc_dyn_node(ip); - if (!node) - goto out_unlock; - - node->ip = ip; - - ftrace_add_hash(node, key); - - ftraced_trigger = 1; - - out_unlock: - spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); - out: - per_cpu(ftrace_shutdown_disable_cpu, cpu)--; - - /* prevent recursion with scheduler */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - -#define FTRACE_ADDR ((long)(ftrace_caller)) - -static int -__ftrace_replace_code(struct dyn_ftrace 
*rec, - unsigned char *old, unsigned char *new, int enable) -{ - unsigned long ip, fl; - - ip = rec->ip; - - if (ftrace_filtered && enable) { - /* - * If filtering is on: - * - * If this record is set to be filtered and - * is enabled then do nothing. - * - * If this record is set to be filtered and - * it is not enabled, enable it. - * - * If this record is not set to be filtered - * and it is not enabled do nothing. - * - * If this record is set not to trace then - * do nothing. - * - * If this record is set not to trace and - * it is enabled then disable it. - * - * If this record is not set to be filtered and - * it is enabled, disable it. - */ - - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | - FTRACE_FL_ENABLED); - - if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || - !fl || (fl == FTRACE_FL_NOTRACE)) - return 0; - - /* - * If it is enabled disable it, - * otherwise enable it! - */ - if (fl & FTRACE_FL_ENABLED) { - /* swap new and old */ - new = old; - old = ftrace_call_replace(ip, FTRACE_ADDR); - rec->flags &= ~FTRACE_FL_ENABLED; - } else { - new = ftrace_call_replace(ip, FTRACE_ADDR); - rec->flags |= FTRACE_FL_ENABLED; - } - } else { - - if (enable) { - /* - * If this record is set not to trace and is - * not enabled, do nothing. 
- */ - fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); - if (fl == FTRACE_FL_NOTRACE) - return 0; - - new = ftrace_call_replace(ip, FTRACE_ADDR); - } else - old = ftrace_call_replace(ip, FTRACE_ADDR); - - if (enable) { - if (rec->flags & FTRACE_FL_ENABLED) - return 0; - rec->flags |= FTRACE_FL_ENABLED; - } else { - if (!(rec->flags & FTRACE_FL_ENABLED)) - return 0; - rec->flags &= ~FTRACE_FL_ENABLED; - } - } - - return ftrace_modify_code(ip, old, new); -} - -static void ftrace_replace_code(int enable) -{ - int i, failed; - unsigned char *new = NULL, *old = NULL; - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - if (enable) - old = ftrace_nop_replace(); - else - new = ftrace_nop_replace(); - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - - /* don't modify code that has already faulted */ - if (rec->flags & FTRACE_FL_FAILED) - continue; - - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } - - failed = __ftrace_replace_code(rec, old, new, enable); - if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { - rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_del_hash(rec); - ftrace_free_rec(rec); - } - } - } - } -} - -static void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - -static int -ftrace_code_disable(struct dyn_ftrace *rec) -{ - unsigned long ip; - unsigned char *nop, *call; - int failed; - - ip = rec->ip; - - nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, MCOUNT_ADDR); - - failed = ftrace_modify_code(ip, call, nop); - if (failed) { - rec->flags |= FTRACE_FL_FAILED; - return 0; - } - return 1; -} - -static int __ftrace_update_code(void *ignore); - -static int 
__ftrace_modify_code(void *data) -{ - unsigned long addr; - int *command = data; - - if (*command & FTRACE_ENABLE_CALLS) { - /* - * Update any recorded ips now that we have the - * machine stopped - */ - __ftrace_update_code(NULL); - ftrace_replace_code(1); - tracing_on = 1; - } else if (*command & FTRACE_DISABLE_CALLS) { - ftrace_replace_code(0); - tracing_on = 0; - } - - if (*command & FTRACE_UPDATE_TRACE_FUNC) - ftrace_update_ftrace_func(ftrace_trace_function); - - if (*command & FTRACE_ENABLE_MCOUNT) { - addr = (unsigned long)ftrace_record_ip; - ftrace_mcount_set(&addr); - } else if (*command & FTRACE_DISABLE_MCOUNT) { - addr = (unsigned long)ftrace_stub; - ftrace_mcount_set(&addr); - } - - return 0; -} - -static void ftrace_run_update_code(int command) -{ - stop_machine(__ftrace_modify_code, &command, NULL); -} - -void ftrace_disable_daemon(void) -{ - /* Stop the daemon from calling kstop_machine */ - mutex_lock(&ftraced_lock); - ftraced_stop = 1; - mutex_unlock(&ftraced_lock); - - ftrace_force_update(); -} - -void ftrace_enable_daemon(void) -{ - mutex_lock(&ftraced_lock); - ftraced_stop = 0; - mutex_unlock(&ftraced_lock); - - ftrace_force_update(); -} - -static ftrace_func_t saved_ftrace_func; - -static void ftrace_startup(void) -{ - int command = 0; - - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftraced_lock); - ftraced_suspend++; - if (ftraced_suspend == 1) - command |= FTRACE_ENABLE_CALLS; - - if (saved_ftrace_func != ftrace_trace_function) { - saved_ftrace_func = ftrace_trace_function; - command |= FTRACE_UPDATE_TRACE_FUNC; - } - - if (!command || !ftrace_enabled) - goto out; - - ftrace_run_update_code(command); - out: - mutex_unlock(&ftraced_lock); -} - -static void ftrace_shutdown(void) -{ - int command = 0; - - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftraced_lock); - ftraced_suspend--; - if (!ftraced_suspend) - command |= FTRACE_DISABLE_CALLS; - - if (saved_ftrace_func != ftrace_trace_function) { - saved_ftrace_func = 
ftrace_trace_function; - command |= FTRACE_UPDATE_TRACE_FUNC; - } - - if (!command || !ftrace_enabled) - goto out; - - ftrace_run_update_code(command); - out: - mutex_unlock(&ftraced_lock); -} - -static void ftrace_startup_sysctl(void) -{ - int command = FTRACE_ENABLE_MCOUNT; - - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftraced_lock); - /* Force update next time */ - saved_ftrace_func = NULL; - /* ftraced_suspend is true if we want ftrace running */ - if (ftraced_suspend) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); -} - -static void ftrace_shutdown_sysctl(void) -{ - int command = FTRACE_DISABLE_MCOUNT; - - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftraced_lock); - /* ftraced_suspend is true if ftrace is running */ - if (ftraced_suspend) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); -} - -static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; -unsigned long ftrace_update_tot_cnt; - -static int __ftrace_update_code(void *ignore) -{ - int i, save_ftrace_enabled; - cycle_t start, stop; - struct dyn_ftrace *p; - struct hlist_node *t, *n; - struct hlist_head *head, temp_list; - - /* Don't be recording funcs now */ - ftrace_record_suspend++; - save_ftrace_enabled = ftrace_enabled; - ftrace_enabled = 0; - - start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - - /* No locks needed, the machine is stopped! */ - for (i = 0; i < FTRACE_HASHSIZE; i++) { - INIT_HLIST_HEAD(&temp_list); - head = &ftrace_hash[i]; - - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(p, t, n, head, node) { - /* Skip over failed records which have not been - * freed. */ - if (p->flags & FTRACE_FL_FAILED) - continue; - - /* Unconverted records are always at the head of the - * hash bucket. Once we encounter a converted record, - * simply skip over to the next bucket. 
Saves ftraced - * some processor cycles (ftrace does its bid for - * global warming :-p ). */ - if (p->flags & (FTRACE_FL_CONVERTED)) - break; - - /* Ignore updates to this record's mcount site. - * Reintroduce this record at the head of this - * bucket to attempt to "convert" it again if - * the kprobe on it is unregistered before the - * next run. */ - if (get_kprobe((void *)p->ip)) { - ftrace_del_hash(p); - INIT_HLIST_NODE(&p->node); - hlist_add_head(&p->node, &temp_list); - freeze_record(p); - continue; - } else { - unfreeze_record(p); - } - - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else { - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(p->ip)) { - ftrace_del_hash(p); - ftrace_free_rec(p); - } - } - } - - hlist_for_each_entry_safe(p, t, n, &temp_list, node) { - hlist_del(&p->node); - INIT_HLIST_NODE(&p->node); - hlist_add_head(&p->node, head); - } - } - - stop = ftrace_now(raw_smp_processor_id()); - ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; - ftraced_trigger = 0; - - ftrace_enabled = save_ftrace_enabled; - ftrace_record_suspend--; - - return 0; -} - -static int ftrace_update_code(void) -{ - if (unlikely(ftrace_disabled) || - !ftrace_enabled || !ftraced_trigger) - return 0; - - stop_machine(__ftrace_update_code, NULL, NULL); - - return 1; -} - -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu 
usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dyn_table_alloc(void) -{ - struct ftrace_page *pg; - int cnt; - int i; - - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; - - /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. - */ - - pg = ftrace_pages = ftrace_pages_start; - - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; - - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - - /* If we fail, we'll try later anyway */ - if (!pg->next) - break; - - pg = pg->next; - } - - return 0; -} - -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), -}; - -#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ - -struct ftrace_iterator { - loff_t pos; - struct ftrace_page *pg; - unsigned idx; - unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; - unsigned filtered; -}; - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = NULL; - - (*pos)++; - - retry: - if (iter->idx >= iter->pg->index) { - if (iter->pg->next) { - iter->pg = iter->pg->next; - iter->idx = 0; - goto retry; - } - } else { - rec = &iter->pg->records[iter->idx++]; - if ((!(iter->flags & FTRACE_ITER_FAILURES) && - (rec->flags & FTRACE_FL_FAILED)) || - - 
((iter->flags & FTRACE_ITER_FAILURES) && - (!(rec->flags & FTRACE_FL_FAILED) || - (rec->flags & FTRACE_FL_FREE))) || - - ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || - - ((iter->flags & FTRACE_ITER_NOTRACE) && - !(rec->flags & FTRACE_FL_NOTRACE))) { - rec = NULL; - goto retry; - } - } - - iter->pos = *pos; - - return rec; -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - struct ftrace_iterator *iter = m->private; - void *p = NULL; - loff_t l = -1; - - if (*pos != iter->pos) { - for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) - ; - } else { - l = *pos; - p = t_next(m, p, &l); - } - - return p; -} - -static void t_stop(struct seq_file *m, void *p) -{ -} - -static int t_show(struct seq_file *m, void *v) -{ - struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; - - if (!rec) - return 0; - - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); - - return 0; -} - -static struct seq_operations show_ftrace_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int -ftrace_avail_open(struct inode *inode, struct file *file) -{ - struct ftrace_iterator *iter; - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->pos = -1; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); - } - - return ret; -} - -int ftrace_avail_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter = m->private; - - seq_release(inode, file); - kfree(iter); - - return 0; -} - -static int -ftrace_failures_open(struct inode *inode, struct file *file) -{ - int ret; - struct seq_file *m; - struct ftrace_iterator *iter; - - ret = ftrace_avail_open(inode, 
file); - if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; - iter->flags = FTRACE_ITER_FAILURES; - } - - return ret; -} - - -static void ftrace_filter_reset(int enable) -{ - struct ftrace_page *pg; - struct dyn_ftrace *rec; - unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned i; - - /* keep kstop machine from running */ - preempt_disable(); - if (enable) - ftrace_filtered = 0; - pg = ftrace_pages_start; - while (pg) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - if (rec->flags & FTRACE_FL_FAILED) - continue; - rec->flags &= ~type; - } - pg = pg->next; - } - preempt_enable(); -} - -static int -ftrace_regex_open(struct inode *inode, struct file *file, int enable) -{ - struct ftrace_iterator *iter; - int ret = 0; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - mutex_lock(&ftrace_regex_lock); - if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) - ftrace_filter_reset(enable); - - if (file->f_mode & FMODE_READ) { - iter->pg = ftrace_pages_start; - iter->pos = -1; - iter->flags = enable ? 
FTRACE_ITER_FILTER : - FTRACE_ITER_NOTRACE; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = iter; - } else - kfree(iter); - } else - file->private_data = iter; - mutex_unlock(&ftrace_regex_lock); - - return ret; -} - -static int -ftrace_filter_open(struct inode *inode, struct file *file) -{ - return ftrace_regex_open(inode, file, 1); -} - -static int -ftrace_notrace_open(struct inode *inode, struct file *file) -{ - return ftrace_regex_open(inode, file, 0); -} - -static ssize_t -ftrace_regex_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - -static loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); - else - file->f_pos = ret = 1; - - return ret; -} - -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -static void -ftrace_match(unsigned char *buff, int len, int enable) -{ - char str[KSYM_SYMBOL_LEN]; - char *search = NULL; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int type = MATCH_FULL; - unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned i, match = 0, search_len = 0; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - search = buff + i + 1; - type = MATCH_END_ONLY; - search_len = len - (i + 1); - } else { - if (type == MATCH_END_ONLY) { - type = MATCH_MIDDLE_ONLY; - } else { - match = i; - type = MATCH_FRONT_ONLY; - } - buff[i] = 0; - break; - } - } - } - - /* keep kstop machine from running */ - preempt_disable(); - if (enable) - ftrace_filtered = 1; - pg = ftrace_pages_start; - while (pg) { - for (i = 0; i < pg->index; i++) { - int matched = 0; - char *ptr; - - rec = &pg->records[i]; - if (rec->flags & FTRACE_FL_FAILED) - continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - switch (type) { - case MATCH_FULL: - if (strcmp(str, buff) == 0) - matched = 1; - break; - case MATCH_FRONT_ONLY: - if (memcmp(str, buff, match) == 0) - matched = 1; - break; - case MATCH_MIDDLE_ONLY: - if (strstr(str, search)) - matched = 1; - break; - case MATCH_END_ONLY: - ptr = strstr(str, search); - if (ptr && (ptr[search_len] == 0)) - matched = 1; - break; - } - if (matched) - rec->flags |= flag; - } - pg = pg->next; - } - preempt_enable(); -} - -static ssize_t -ftrace_regex_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos, int enable) -{ - struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; - - if (!cnt || cnt < 0) - return 0; - - mutex_lock(&ftrace_regex_lock); - - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - iter = m->private; - } else - iter = file->private_data; - - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - - if (!(iter->flags & ~FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - if (isspace(ch)) { - file->f_pos += read; - ret = read; - goto 
out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - if (isspace(ch)) { - iter->filtered++; - iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx, enable); - iter->buffer_idx = 0; - } else - iter->flags |= FTRACE_ITER_CONT; - - - file->f_pos += read; - - ret = read; - out: - mutex_unlock(&ftrace_regex_lock); - - return ret; -} - -static ssize_t -ftrace_filter_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return ftrace_regex_write(file, ubuf, cnt, ppos, 1); -} - -static ssize_t -ftrace_notrace_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return ftrace_regex_write(file, ubuf, cnt, ppos, 0); -} - -static void -ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) -{ - if (unlikely(ftrace_disabled)) - return; - - mutex_lock(&ftrace_regex_lock); - if (reset) - ftrace_filter_reset(enable); - if (buf) - ftrace_match(buf, len, enable); - mutex_unlock(&ftrace_regex_lock); -} - -/** - * ftrace_set_filter - set a function to filter on in ftrace - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. - * - * Filters denote which functions should be enabled when tracing is enabled. - * If @buf is NULL and reset is set, all functions will be enabled for tracing. - */ -void ftrace_set_filter(unsigned char *buf, int len, int reset) -{ - ftrace_set_regex(buf, len, reset, 1); -} - -/** - * ftrace_set_notrace - set a function to not trace in ftrace - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. 
- * - * Notrace Filters denote which functions should not be enabled when tracing - * is enabled. If @buf is NULL and reset is set, all functions will be enabled - * for tracing. - */ -void ftrace_set_notrace(unsigned char *buf, int len, int reset) -{ - ftrace_set_regex(buf, len, reset, 0); -} - -static int -ftrace_regex_release(struct inode *inode, struct file *file, int enable) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter; - - mutex_lock(&ftrace_regex_lock); - if (file->f_mode & FMODE_READ) { - iter = m->private; - - seq_release(inode, file); - } else - iter = file->private_data; - - if (iter->buffer_idx) { - iter->filtered++; - iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx, enable); - } - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (iter->filtered && ftraced_suspend && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - kfree(iter); - mutex_unlock(&ftrace_regex_lock); - return 0; -} - -static int -ftrace_filter_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 1); -} - -static int -ftrace_notrace_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 0); -} - -static ssize_t -ftraced_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - /* don't worry about races */ - char *buf = ftraced_stop ? 
"disabled\n" : "enabled\n"; - int r = strlen(buf); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -ftraced_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - long val; - int ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - if (strncmp(buf, "enable", 6) == 0) - val = 1; - else if (strncmp(buf, "disable", 7) == 0) - val = 0; - else { - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - val = !!val; - } - - if (val) - ftrace_enable_daemon(); - else - ftrace_disable_daemon(); - - filp->f_pos += cnt; - - return cnt; -} - -static struct file_operations ftrace_avail_fops = { - .open = ftrace_avail_open, - .read = seq_read, - .llseek = seq_lseek, - .release = ftrace_avail_release, -}; - -static struct file_operations ftrace_failures_fops = { - .open = ftrace_failures_open, - .read = seq_read, - .llseek = seq_lseek, - .release = ftrace_avail_release, -}; - -static struct file_operations ftrace_filter_fops = { - .open = ftrace_filter_open, - .read = ftrace_regex_read, - .write = ftrace_filter_write, - .llseek = no_llseek, - .release = ftrace_filter_release, -}; - -static struct file_operations ftrace_notrace_fops = { - .open = ftrace_notrace_open, - .read = ftrace_regex_read, - .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, - .release = ftrace_notrace_release, -}; - -static struct file_operations ftraced_fops = { - .open = tracing_open_generic, - .read = ftraced_read, - .write = ftraced_write, -}; - -/** - * ftrace_force_update - force an update to all recording ftrace functions - */ -int ftrace_force_update(void) -{ - int ret = 0; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - - /* - * If ftraced_trigger is not set, then there is nothing - * to update. 
- */ - if (ftraced_trigger && !ftrace_update_code()) - ret = -EBUSY; - - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - return ret; -} - -static void ftrace_force_shutdown(void) -{ - struct task_struct *task; - int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; - - mutex_lock(&ftraced_lock); - task = ftraced_task; - ftraced_task = NULL; - ftraced_suspend = -1; - ftrace_run_update_code(command); - mutex_unlock(&ftraced_lock); - - if (task) - kthread_stop(task); -} - -static __init int ftrace_init_debugfs(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); - - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); - - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); - - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); - - entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, - NULL, &ftraced_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ftraced_enabled' entry\n"); - return 0; -} - -fs_initcall(ftrace_init_debugfs); - -static int __init ftrace_dynamic_init(void) -{ - struct task_struct *p; - unsigned long addr; - int ret; - - addr = (unsigned long)ftrace_record_ip; - - stop_machine(ftrace_dyn_arch_init, &addr, NULL); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) { - ret = (int)addr; - goto failed; - } - - ret = ftrace_dyn_table_alloc(); - if (ret) - goto 
failed; - - p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) { - ret = -1; - goto failed; - } - - last_ftrace_enabled = ftrace_enabled = 1; - ftraced_task = p; - - return 0; - - failed: - ftrace_disabled = 1; - return ret; -} - -core_initcall(ftrace_dynamic_init); -#else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) -# define ftrace_force_shutdown() do { } while (0) -#endif /* CONFIG_DYNAMIC_FTRACE */ - -/** - * ftrace_kill_atomic - kill ftrace from critical sections - * - * This function should be used by panic code. It stops ftrace - * but in a not so nice way. If you need to simply kill ftrace - * from a non-atomic section, use ftrace_kill. - */ -void ftrace_kill_atomic(void) -{ - ftrace_disabled = 1; - ftrace_enabled = 0; -#ifdef CONFIG_DYNAMIC_FTRACE - ftraced_suspend = -1; -#endif - clear_ftrace_function(); -} - -/** - * ftrace_kill - totally shutdown ftrace - * - * This is a safety measure. If something was detected that seems - * wrong, calling this function will keep ftrace from doing - * any more modifications, and updates. - * used when something went wrong. - */ -void ftrace_kill(void) -{ - mutex_lock(&ftrace_sysctl_lock); - ftrace_disabled = 1; - ftrace_enabled = 0; - - clear_ftrace_function(); - mutex_unlock(&ftrace_sysctl_lock); - - /* Try to totally disable ftrace */ - ftrace_force_shutdown(); -} - -/** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. - * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. 
- */ -int register_ftrace_function(struct ftrace_ops *ops) -{ - int ret; - - if (unlikely(ftrace_disabled)) - return -1; - - mutex_lock(&ftrace_sysctl_lock); - ret = __register_ftrace_function(ops); - ftrace_startup(); - mutex_unlock(&ftrace_sysctl_lock); - - return ret; -} - -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) -{ - int ret; - - mutex_lock(&ftrace_sysctl_lock); - ret = __unregister_ftrace_function(ops); - ftrace_shutdown(); - mutex_unlock(&ftrace_sysctl_lock); - - return ret; -} - -int -ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - mutex_lock(&ftrace_sysctl_lock); - - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); - - if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) - goto out; - - last_ftrace_enabled = ftrace_enabled; - - if (ftrace_enabled) { - - ftrace_startup_sysctl(); - - /* we are starting ftrace again */ - if (ftrace_list != &ftrace_list_end) { - if (ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; - else - ftrace_trace_function = ftrace_list_func; - } - - } else { - /* stopping ftrace calls (just send to ftrace_stub) */ - ftrace_trace_function = ftrace_stub; - - ftrace_shutdown_sysctl(); - } - - out: - mutex_unlock(&ftrace_sysctl_lock); - return ret; -} -/* - * ring buffer based function tracer - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * Originally taken from the RT patch by: - * Arnaldo Carvalho de Melo - * - * Based on code from the latency_tracer, that is: - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include 
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - -unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; -unsigned long __read_mostly tracing_thresh; - -static unsigned long __read_mostly tracing_nr_buffers; -static cpumask_t __read_mostly tracing_buffer_mask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu_mask(cpu, tracing_buffer_mask) - -static int trace_alloc_page(void); -static int trace_free_page(void); - -static int tracing_disabled = 1; - -static unsigned long tracing_pages_allocated; - -long -ns2usecs(cycle_t nsec) -{ - nsec += 500; - do_div(nsec, 1000); - return nsec; -} - -cycle_t ftrace_now(int cpu) -{ - return cpu_clock(cpu); -} - -/* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. Each CPU has the same number - * of pages allocated for its buffer. - */ -static struct trace_array global_trace; - -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); - -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. 
- */ -static struct trace_array max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_data); - -/* tracer_enabled is used to toggle activation of a tracer */ -static int tracer_enabled = 1; - -/* function tracing enabled */ -int ftrace_function_enabled; - -/* - * trace_nr_entries is the number of entries that is allocated - * for a buffer. Note, the number of entries is always rounded - * to ENTRIES_PER_PAGE. - */ -static unsigned long trace_nr_entries = 65536UL; - -/* trace_types holds a link list of available tracers. */ -static struct tracer *trace_types __read_mostly; - -/* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; - -/* - * max_tracer_type_len is used to simplify the allocating of - * buffers to read userspace tracer names. We keep track of - * the longest tracer name registered. - */ -static int max_tracer_type_len; - -/* - * trace_types_lock is used to protect the trace_types list. - * This lock is also used to keep user access serialized. - * Accesses from userspace will grab this lock while userspace - * activities happen inside the kernel. - */ -static DEFINE_MUTEX(trace_types_lock); - -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - -/* trace_flags holds iter_ctrl options */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; - -static notrace void no_trace_init(struct trace_array *tr) -{ - int cpu; - - ftrace_function_enabled = 0; - if(tr->ctrl) - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); - tracer_enabled = 0; -} - -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { - .name = "none", - .init = no_trace_init -}; - - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Simply wakes up any task that is blocked on the trace_wait - * queue. These is used with trace_poll for tasks polling the trace. 
- */ -void trace_wake_up(void) -{ - /* - * The runqueue_is_locked() can fail, but this is the best we - * have for now: - */ - if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) - wake_up(&trace_wait); -} - -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) - -static int __init set_nr_entries(char *str) -{ - unsigned long nr_entries; - int ret; - - if (!str) - return 0; - ret = strict_strtoul(str, 0, &nr_entries); - /* nr_entries can not be zero */ - if (ret < 0 || nr_entries == 0) - return 0; - trace_nr_entries = nr_entries; - return 1; -} -__setup("trace_entries=", set_nr_entries); - -unsigned long nsecs_to_usecs(unsigned long nsecs) -{ - return nsecs / 1000; -} - -/* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * NEED_RESCED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - */ -enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_NEED_RESCHED = 0x02, - TRACE_FLAG_HARDIRQ = 0x04, - TRACE_FLAG_SOFTIRQ = 0x08, -}; - -/* - * TRACE_ITER_SYM_MASK masks the options in trace_flags that - * control the output of kernel symbols. - */ -#define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) - -/* These must match the bit postions in trace_iterator_flags */ -static const char *trace_options[] = { - "print-parent", - "sym-offset", - "sym-addr", - "verbose", - "raw", - "hex", - "bin", - "block", - "stacktrace", - "sched-tree", - NULL -}; - -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a raw_spinlock_t in order to help - * with performance when lockdep debugging is enabled. 
- */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /debugfs/tracing/latency_trace) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct trace_array_cpu *data = tr->data[cpu]; - - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; - - data = max_tr.data[cpu]; - data->saved_latency = tracing_max_latency; - - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); - data->pid = tsk->pid; - data->uid = tsk->uid; - data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - data->policy = tsk->policy; - data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(current); -} - -#define CHECK_COND(cond) \ - if (unlikely(cond)) { \ - tracing_disabled = 1; \ - WARN_ON(1); \ - return -1; \ - } - -/** - * check_pages - integrity check of trace buffers - * - * As a safty measure we check to make sure the data pages have not - * been corrupted. - */ -int check_pages(struct trace_array_cpu *data) -{ - struct page *page, *tmp; - - CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); - CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); - - list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - CHECK_COND(page->lru.next->prev != &page->lru); - CHECK_COND(page->lru.prev->next != &page->lru); - } - - return 0; -} - -/** - * head_page - page address of the first page in per_cpu buffer. - * - * head_page returns the page address of the first page in - * a per_cpu buffer. This also preforms various consistency - * checks to make sure the buffer has not been corrupted. 
- */ -void *head_page(struct trace_array_cpu *data) -{ - struct page *page; - - if (list_empty(&data->trace_pages)) - return NULL; - - page = list_entry(data->trace_pages.next, struct page, lru); - BUG_ON(&page->lru == &data->trace_pages); - - return page_address(page); -} - -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (!len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) - return 0; - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. 
- */ -static int -trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -static int -trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->len >= (PAGE_SIZE - 1)) - return 0; - - s->buffer[s->len++] = c; - - return 1; -} - -static int -trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) -{ - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -#define HEX_CHARS 17 -static const char hex2asc[] = "0123456789abcdef"; - -static int -trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; - unsigned char byte; - int i, j; - - BUG_ON(len >= HEX_CHARS); - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - byte = data[i]; - - hex[j++] = hex2asc[byte & 0x0f]; - hex[j++] = hex2asc[byte >> 4]; - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -static void -trace_seq_reset(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) -{ - int len; - int ret; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret) - return -EFAULT; - - s->readpos += len; - return cnt; -} - -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_reset(s); -} - -/* - * flip the trace buffers between two trace descriptors. - * This usually is the buffers between the global_trace and - * the max_tr to record a snapshot of a current trace. 
- * - * The ftrace_max_lock must be held. - */ -static void -flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) -{ - struct list_head flip_pages; - - INIT_LIST_HEAD(&flip_pages); - - memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, - sizeof(struct trace_array_cpu) - - offsetof(struct trace_array_cpu, trace_head_idx)); - - check_pages(tr1); - check_pages(tr2); - list_splice_init(&tr1->trace_pages, &flip_pages); - list_splice_init(&tr2->trace_pages, &tr1->trace_pages); - list_splice_init(&flip_pages, &tr2->trace_pages); - BUG_ON(!list_empty(&flip_pages)); - check_pages(tr1); - check_pages(tr2); -} - -/** - * update_max_tr - snapshot all trace buffers from global_trace to max_tr - * @tr: tracer - * @tsk: the task with the latency - * @cpu: The cpu that initiated the trace. - * - * Flip the buffers between the @tr and the max_tr and record information - * about which task was the cause of this latency. - */ -void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct trace_array_cpu *data; - int i; - - WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); - /* clear out all the previous traces */ - for_each_tracing_cpu(i) { - data = tr->data[i]; - flip_trace(max_tr.data[i], data); - tracing_reset(data); - } - - __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); -} - -/** - * update_max_tr_single - only copy one trace over, and reset the rest - * @tr - tracer - * @tsk - task with the latency - * @cpu - the cpu of the buffer to copy. - * - * Flip the trace of a single CPU buffer between the @tr and the max_tr. 
- */ -void -update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct trace_array_cpu *data = tr->data[cpu]; - int i; - - WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); - for_each_tracing_cpu(i) - tracing_reset(max_tr.data[i]); - - flip_trace(max_tr.data[cpu], data); - tracing_reset(data); - - __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); -} - -/** - * register_tracer - register a tracer with the ftrace system. - * @type - the plugin for the tracer - * - * Register a new plugin tracer. - */ -int register_tracer(struct tracer *type) -{ - struct tracer *t; - int len; - int ret = 0; - - if (!type->name) { - pr_info("Tracer must have a name\n"); - return -1; - } - - mutex_lock(&trace_types_lock); - for (t = trace_types; t; t = t->next) { - if (strcmp(type->name, t->name) == 0) { - /* already found */ - pr_info("Trace %s already registered\n", - type->name); - ret = -1; - goto out; - } - } - -#ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest) { - struct tracer *saved_tracer = current_trace; - struct trace_array_cpu *data; - struct trace_array *tr = &global_trace; - int saved_ctrl = tr->ctrl; - int i; - /* - * Run a selftest on this tracer. - * Here we reset the trace buffer, and set the current - * tracer to be this tracer. The tracer can then run some - * internal tracing to verify that everything is in order. - * If we fail, we do not register this tracer. 
- */ - for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); - } - current_trace = type; - tr->ctrl = 0; - /* the test is responsible for initializing and enabling */ - pr_info("Testing tracer %s: ", type->name); - ret = type->selftest(type, tr); - /* the test is responsible for resetting too */ - current_trace = saved_tracer; - tr->ctrl = saved_ctrl; - if (ret) { - printk(KERN_CONT "FAILED!\n"); - goto out; - } - /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); - } - printk(KERN_CONT "PASSED\n"); - } -#endif - - type->next = trace_types; - trace_types = type; - len = strlen(type->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - - out: - mutex_unlock(&trace_types_lock); - - return ret; -} - -void unregister_tracer(struct tracer *type) -{ - struct tracer **t; - int len; - - mutex_lock(&trace_types_lock); - for (t = &trace_types; *t; t = &(*t)->next) { - if (*t == type) - goto found; - } - pr_info("Trace %s not registered\n", type->name); - goto out; - - found: - *t = (*t)->next; - if (strlen(type->name) != max_tracer_type_len) - goto out; - - max_tracer_type_len = 0; - for (t = &trace_types; *t; t = &(*t)->next) { - len = strlen((*t)->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - } - out: - mutex_unlock(&trace_types_lock); -} - -void tracing_reset(struct trace_array_cpu *data) -{ - data->trace_idx = 0; - data->overrun = 0; - data->trace_head = data->trace_tail = head_page(data); - data->trace_head_idx = 0; - data->trace_tail_idx = 0; -} - -#define SAVED_CMDLINES 128 -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; -static DEFINE_SPINLOCK(trace_cmdline_lock); - -/* temporary disable recording */ 
-atomic_t trace_record_cmdline_disabled __read_mostly; - -static void trace_init_cmdlines(void) -{ - memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; -} - -void trace_stop_cmdline_recording(void); - -static void trace_save_cmdline(struct task_struct *tsk) -{ - unsigned map; - unsigned idx; - - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; - - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - */ - if (!spin_trylock(&trace_cmdline_lock)) - return; - - idx = map_pid_to_cmdline[tsk->pid]; - if (idx >= SAVED_CMDLINES) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; - - map = map_cmdline_to_pid[idx]; - if (map <= PID_MAX_DEFAULT) - map_pid_to_cmdline[map] = (unsigned)-1; - - map_pid_to_cmdline[tsk->pid] = idx; - - cmdline_idx = idx; - } - - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - - spin_unlock(&trace_cmdline_lock); -} - -static char *trace_find_cmdline(int pid) -{ - char *cmdline = "<...>"; - unsigned map; - - if (!pid) - return ""; - - if (pid > PID_MAX_DEFAULT) - goto out; - - map = map_pid_to_cmdline[pid]; - if (map >= SAVED_CMDLINES) - goto out; - - cmdline = saved_cmdlines[map]; - - out: - return cmdline; -} - -void tracing_record_cmdline(struct task_struct *tsk) -{ - if (atomic_read(&trace_record_cmdline_disabled)) - return; - - trace_save_cmdline(tsk); -} - -static inline struct list_head * -trace_next_list(struct trace_array_cpu *data, struct list_head *next) -{ - /* - * Roundrobin - but skip the head (which is not a real page): - */ - next = next->next; - if (unlikely(next == &data->trace_pages)) - next = next->next; - BUG_ON(next == &data->trace_pages); - - return next; -} - -static inline void * -trace_next_page(struct trace_array_cpu *data, void *addr) -{ - struct list_head *next; - struct 
page *page; - - page = virt_to_page(addr); - - next = trace_next_list(data, &page->lru); - page = list_entry(next, struct page, lru); - - return page_address(page); -} - -static inline struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) -{ - unsigned long idx, idx_next; - struct trace_entry *entry; - - data->trace_idx++; - idx = data->trace_head_idx; - idx_next = idx + 1; - - BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - - entry = data->trace_head + idx * TRACE_ENTRY_SIZE; - - if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { - data->trace_head = trace_next_page(data, data->trace_head); - idx_next = 0; - } - - if (data->trace_head == data->trace_tail && - idx_next == data->trace_tail_idx) { - /* overrun */ - data->overrun++; - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = - trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } - } - - data->trace_head_idx = idx_next; - - return entry; -} - -static inline void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) -{ - struct task_struct *tsk = current; - unsigned long pc; - - pc = preempt_count(); - - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->t = ftrace_now(raw_smp_processor_id()); - entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | - ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); -} - -void -trace_function(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); -} - -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) -{ - if (likely(!atomic_read(&data->disabled))) - trace_function(tr, data, ip, parent_ip, flags); -} - -#ifdef CONFIG_MMIOTRACE -void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_rw *rw) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_RW; - entry->mmiorw = *rw; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - -void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_map *map) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_MAP; - entry->mmiomap = *map; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} -#endif - -void __trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, - int skip) -{ - struct trace_entry *entry; - struct stack_trace trace; - - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - 
return; - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_STACK; - - memset(&entry->stack, 0, sizeof(entry->stack)); - - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->stack.caller; - - save_stack_trace(&trace); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array_cpu *data = __data; - struct trace_array *tr = __tr; - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->special.arg1 = arg1; - entry->special.arg2 = arg2; - entry->special.arg3 = arg3; - __trace_stack(tr, data, irq_flags, 4); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - -void -tracing_sched_switch_trace(struct trace_array *tr, - struct trace_array_cpu *data, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_CTX; - entry->ctx.prev_pid = prev->pid; - entry->ctx.prev_prio = prev->prio; - entry->ctx.prev_state = prev->state; - entry->ctx.next_pid = next->pid; - entry->ctx.next_prio = next->prio; - entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 5); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct trace_array_cpu *data, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags) -{ - struct trace_entry *entry; - unsigned long irq_flags; - - 
raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_WAKE; - entry->ctx.prev_pid = curr->pid; - entry->ctx.prev_prio = curr->prio; - entry->ctx.prev_state = curr->state; - entry->ctx.next_pid = wakee->pid; - entry->ctx.next_prio = wakee->prio; - entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 6); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - __trace_special(tr, data, arg1, arg2, arg3); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -#ifdef CONFIG_FTRACE -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (unlikely(!ftrace_function_enabled)) - return; - - if (skip_trace(ip)) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, -}; - -void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - register_ftrace_function(&trace_ops); - if (tracer_enabled) - ftrace_function_enabled = 1; -} - -void 
tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - unregister_ftrace_function(&trace_ops); -} -#endif - -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, -}; - -static struct trace_entry * -trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, - struct trace_iterator *iter, int cpu) -{ - struct page *page; - struct trace_entry *array; - - if (iter->next_idx[cpu] >= tr->entries || - iter->next_idx[cpu] >= data->trace_idx || - (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx)) - return NULL; - - if (!iter->next_page[cpu]) { - /* Initialize the iterator for this cpu trace buffer */ - WARN_ON(!data->trace_tail); - page = virt_to_page(data->trace_tail); - iter->next_page[cpu] = &page->lru; - iter->next_page_idx[cpu] = data->trace_tail_idx; - } - - page = list_entry(iter->next_page[cpu], struct page, lru); - BUG_ON(&data->trace_pages == &page->lru); - - array = page_address(page); - - WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); - return &array[iter->next_page_idx[cpu]]; -} - -static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu) -{ - struct trace_array *tr = iter->tr; - struct trace_entry *ent, *next = NULL; - int next_cpu = -1; - int cpu; - - for_each_tracing_cpu(cpu) { - if (!head_page(tr->data[cpu])) - continue; - ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); - /* - * Pick the entry with the smallest timestamp: - */ - if (ent && (!next || ent->t < next->t)) { - next = ent; - next_cpu = cpu; - } - } - - if (ent_cpu) - *ent_cpu = next_cpu; - - return next; -} - -static void trace_iterator_increment(struct trace_iterator *iter) -{ - iter->idx++; - iter->next_idx[iter->cpu]++; - iter->next_page_idx[iter->cpu]++; - - if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - - iter->next_page_idx[iter->cpu] = 0; - iter->next_page[iter->cpu] = - trace_next_list(data, 
iter->next_page[iter->cpu]); - } -} - -static void trace_consume(struct trace_iterator *iter) -{ - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } - - /* Check if we empty it, then reset the index */ - if (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx) - data->trace_idx = 0; -} - -static void *find_next_entry_inc(struct trace_iterator *iter) -{ - struct trace_entry *next; - int next_cpu = -1; - - next = find_next_entry(iter, &next_cpu); - - iter->prev_ent = iter->ent; - iter->prev_cpu = iter->cpu; - - iter->ent = next; - iter->cpu = next_cpu; - - if (next) - trace_iterator_increment(iter); - - return next ? iter : NULL; -} - -static void *s_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_iterator *iter = m->private; - int i = (int)*pos; - void *ent; - - (*pos)++; - - /* can't go backwards */ - if (iter->idx > i) - return NULL; - - if (iter->idx < 0) - ent = find_next_entry_inc(iter); - else - ent = iter; - - while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); - - iter->pos = *pos; - - return ent; -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - struct trace_iterator *iter = m->private; - void *p = NULL; - loff_t l = 0; - int i; - - mutex_lock(&trace_types_lock); - - if (!current_trace || current_trace != iter->trace) { - mutex_unlock(&trace_types_lock); - return NULL; - } - - atomic_inc(&trace_record_cmdline_disabled); - - /* let the tracer grab locks here if needed */ - if (current_trace->start) - current_trace->start(iter); - - if (*pos != iter->pos) { - iter->ent = NULL; - iter->cpu = 0; - iter->idx = -1; - iter->prev_ent = NULL; - iter->prev_cpu = -1; - - for_each_tracing_cpu(i) { - iter->next_idx[i] = 0; - iter->next_page[i] = NULL; - } - - for (p = iter; p && l < *pos; p = s_next(m, p, &l)) - ; - 
- } else { - l = *pos - 1; - p = s_next(m, p, &l); - } - - return p; -} - -static void s_stop(struct seq_file *m, void *p) -{ - struct trace_iterator *iter = m->private; - - atomic_dec(&trace_record_cmdline_disabled); - - /* let the tracer release locks here if needed */ - if (current_trace && current_trace == iter->trace && iter->trace->stop) - iter->trace->stop(iter); - - mutex_unlock(&trace_types_lock); -} - -#define KRETPROBE_MSG "[unknown/kretprobe'd]" - -#ifdef CONFIG_KRETPROBES -static inline int kretprobed(unsigned long addr) -{ - return addr == (unsigned long)kretprobe_trampoline; -} -#else -static inline int kretprobed(unsigned long addr) -{ - return 0; -} -#endif /* CONFIG_KRETPROBES */ - -static int -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - - kallsyms_lookup(address, NULL, NULL, NULL, str); - - return trace_seq_printf(s, fmt, str); -#endif - return 1; -} - -static int -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - - sprint_symbol(str, address); - return trace_seq_printf(s, fmt, str); -#endif - return 1; -} - -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - -static int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) -{ - int ret; - - if (!ip) - return trace_seq_printf(s, "0"); - - if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); - else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; - - if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static void print_lat_help_header(struct seq_file *m) -{ - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, 
"# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); -} - -static void print_func_help_header(struct seq_file *m) -{ - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); -} - - -static void -print_trace_header(struct seq_file *m, struct trace_iterator *iter) -{ - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[tr->cpu]; - struct tracer *type = current_trace; - unsigned long total = 0; - unsigned long entries = 0; - int cpu; - const char *name = "preemption"; - - if (type) - name = type->name; - - for_each_tracing_cpu(cpu) { - if (head_page(tr->data[cpu])) { - total += tr->data[cpu]->trace_idx; - if (tr->data[cpu]->trace_idx > tr->entries) - entries += tr->entries; - else - entries += tr->data[cpu]->trace_idx; - } - } - - seq_printf(m, "%s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); - seq_puts(m, "-----------------------------------" - "---------------------------------\n"); - seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" - " (M:%s VP:%d, KP:%d, SP:%d HP:%d", - nsecs_to_usecs(data->saved_latency), - entries, - total, - tr->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#else - "unknown", -#endif - /* These are reserved for later use */ - 0, 0, 0, 0); -#ifdef CONFIG_SMP - seq_printf(m, " #P:%d)\n", num_online_cpus()); -#else - seq_puts(m, ")\n"); -#endif - seq_puts(m, " -----------------\n"); - seq_printf(m, " | task: %.16s-%d " - "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", - data->comm, data->pid, data->uid, data->nice, - data->policy, data->rt_priority); - seq_puts(m, " -----------------\n"); - - if (data->critical_start) { - seq_puts(m, " => started at: "); - 
seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); - trace_print_seq(m, &iter->seq); - seq_puts(m, "\n => ended at: "); - seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); - trace_print_seq(m, &iter->seq); - seq_puts(m, "\n"); - } - - seq_puts(m, "\n"); -} - -static void -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) -{ - int hardirq, softirq; - char *comm; - - comm = trace_find_cmdline(entry->pid); - - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%d", cpu); - trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); - - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) { - trace_seq_putc(s, 'H'); - } else { - if (hardirq) { - trace_seq_putc(s, 'h'); - } else { - if (softirq) - trace_seq_putc(s, 's'); - else - trace_seq_putc(s, '.'); - } - } - - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); - else - trace_seq_puts(s, "."); -} - -unsigned long preempt_mark_thresh = 100; - -static void -lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, - unsigned long rel_usecs) -{ - trace_seq_printf(s, " %4lldus", abs_usecs); - if (rel_usecs > preempt_mark_thresh) - trace_seq_puts(s, "!: "); - else if (rel_usecs > 1) - trace_seq_puts(s, "+: "); - else - trace_seq_puts(s, " : "); -} - -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int -print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) -{ - struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry = find_next_entry(iter, NULL); - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); - struct trace_entry *entry = iter->ent; - unsigned long abs_usecs; - unsigned long rel_usecs; - char *comm; - int S, T; - int i; - unsigned state; 
- - if (!next_entry) - next_entry = entry; - rel_usecs = ns2usecs(next_entry->t - entry->t); - abs_usecs = ns2usecs(entry->t - iter->tr->time_start); - - if (verbose) { - comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(entry->t), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, - rel_usecs % 1000); - } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); - } - switch (entry->type) { - case TRACE_FN: - seq_print_ip_sym(s, entry->fn.ip, sym_flags); - trace_seq_puts(s, " ("); - if (kretprobed(entry->fn.parent_ip)) - trace_seq_puts(s, KRETPROBE_MSG); - else - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); - trace_seq_puts(s, ")\n"); - break; - case TRACE_CTX: - case TRACE_WAKE: - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; - - state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; - comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, entry->type == TRACE_CTX ? 
"==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, - T, comm); - break; - case TRACE_SPECIAL: - trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); - break; - case TRACE_STACK: - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) - trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); - } - trace_seq_puts(s, "\n"); - break; - default: - trace_seq_printf(s, "Unknown type %d\n", entry->type); - } - return 1; -} - -static int print_trace_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *entry; - unsigned long usec_rem; - unsigned long long t; - unsigned long secs; - char *comm; - int ret; - int S, T; - int i; - - entry = iter->ent; - - comm = trace_find_cmdline(iter->ent->pid); - - t = ns2usecs(entry->t); - usec_rem = do_div(t, 1000000ULL); - secs = (unsigned long)t; - - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; - - switch (entry->type) { - case TRACE_FN: - ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); - if (!ret) - return 0; - if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - entry->fn.parent_ip) { - ret = trace_seq_printf(s, " <-"); - if (!ret) - return 0; - if (kretprobed(entry->fn.parent_ip)) - ret = trace_seq_puts(s, KRETPROBE_MSG); - else - ret = seq_print_ip_sym(s, entry->fn.parent_ip, - sym_flags); - if (!ret) - return 0; - } - ret = trace_seq_printf(s, "\n"); - if (!ret) - return 0; - break; - case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? 
- state_to_char[entry->ctx.next_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, - entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, - T); - if (!ret) - return 0; - break; - case TRACE_SPECIAL: - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); - if (!ret) - return 0; - break; - case TRACE_STACK: - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) { - ret = trace_seq_puts(s, " <= "); - if (!ret) - return 0; - } - ret = seq_print_ip_sym(s, entry->stack.caller[i], - sym_flags); - if (!ret) - return 0; - } - ret = trace_seq_puts(s, "\n"); - if (!ret) - return 0; - break; - } - return 1; -} - -static int print_raw_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry; - int ret; - int S, T; - - entry = iter->ent; - - ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, entry->t); - if (!ret) - return 0; - - switch (entry->type) { - case TRACE_FN: - ret = trace_seq_printf(s, "%x %x\n", - entry->fn.ip, entry->fn.parent_ip); - if (!ret) - return 0; - break; - case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? 
- state_to_char[entry->ctx.next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, - entry->ctx.next_pid, - entry->ctx.next_prio, - T); - if (!ret) - return 0; - break; - case TRACE_SPECIAL: - case TRACE_STACK: - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); - if (!ret) - return 0; - break; - } - return 1; -} - -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - -static int print_hex_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - unsigned char newline = '\n'; - struct trace_entry *entry; - int S, T; - - entry = iter->ent; - - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, entry->t); - - switch (entry->type) { - case TRACE_FN: - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); - break; - case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? 
- state_to_char[entry->ctx.next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); - SEQ_PUT_HEX_FIELD_RET(s, T); - break; - case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); - break; - } - SEQ_PUT_FIELD_RET(s, newline); - - return 1; -} - -static int print_bin_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry; - - entry = iter->ent; - - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, entry->t); - - switch (entry->type) { - case TRACE_FN: - SEQ_PUT_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); - break; - case TRACE_CTX: - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); - break; - case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_FIELD_RET(s, entry->special.arg3); - break; - } - return 1; -} - -static int trace_empty(struct trace_iterator *iter) -{ - struct trace_array_cpu *data; - int cpu; - - for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (head_page(data) && data->trace_idx && - (data->trace_tail != data->trace_head || - data->trace_tail_idx != data->trace_head_idx)) - return 0; - } - return 1; -} - -static int print_trace_line(struct trace_iterator *iter) -{ - if (iter->trace && 
iter->trace->print_line) - return iter->trace->print_line(iter); - - if (trace_flags & TRACE_ITER_BIN) - return print_bin_fmt(iter); - - if (trace_flags & TRACE_ITER_HEX) - return print_hex_fmt(iter); - - if (trace_flags & TRACE_ITER_RAW) - return print_raw_fmt(iter); - - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - return print_lat_fmt(iter, iter->idx, iter->cpu); - - return print_trace_fmt(iter); -} - -static int s_show(struct seq_file *m, void *v) -{ - struct trace_iterator *iter = v; - - if (iter->ent == NULL) { - if (iter->tr) { - seq_printf(m, "# tracer: %s\n", iter->trace->name); - seq_puts(m, "#\n"); - } - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return 0; - print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); - } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); - } - } else { - print_trace_line(iter); - trace_print_seq(m, &iter->seq); - } - - return 0; -} - -static struct seq_operations tracer_seq_ops = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file, int *ret) -{ - struct trace_iterator *iter; - - if (tracing_disabled) { - *ret = -ENODEV; - return NULL; - } - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - *ret = -ENOMEM; - goto out; - } - - mutex_lock(&trace_types_lock); - if (current_trace && current_trace->print_max) - iter->tr = &max_tr; - else - iter->tr = inode->i_private; - iter->trace = current_trace; - iter->pos = -1; - - /* TODO stop tracer */ - *ret = seq_open(file, &tracer_seq_ops); - if (!*ret) { - struct seq_file *m = file->private_data; - m->private = iter; - - /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } - - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - } else { - 
kfree(iter); - iter = NULL; - } - mutex_unlock(&trace_types_lock); - - out: - return iter; -} - -int tracing_open_generic(struct inode *inode, struct file *filp) -{ - if (tracing_disabled) - return -ENODEV; - - filp->private_data = inode->i_private; - return 0; -} - -int tracing_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct trace_iterator *iter = m->private; - - mutex_lock(&trace_types_lock); - if (iter->trace && iter->trace->close) - iter->trace->close(iter); - - /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) { - tracer_enabled = 1; - /* - * It is safe to enable function tracing even if it - * isn't used - */ - ftrace_function_enabled = 1; - } - mutex_unlock(&trace_types_lock); - - seq_release(inode, file); - kfree(iter); - return 0; -} - -static int tracing_open(struct inode *inode, struct file *file) -{ - int ret; - - __tracing_open(inode, file, &ret); - - return ret; -} - -static int tracing_lt_open(struct inode *inode, struct file *file) -{ - struct trace_iterator *iter; - int ret; - - iter = __tracing_open(inode, file, &ret); - - if (!ret) - iter->iter_flags |= TRACE_FILE_LAT_FMT; - - return ret; -} - - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct tracer *t = m->private; - - (*pos)++; - - if (t) - t = t->next; - - m->private = t; - - return t; -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - struct tracer *t = m->private; - loff_t l = 0; - - mutex_lock(&trace_types_lock); - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; -} - -static void t_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&trace_types_lock); -} - -static int t_show(struct seq_file *m, void *v) -{ - struct tracer *t = v; - - if (!t) - return 0; - - seq_printf(m, "%s", t->name); - if (t->next) - seq_putc(m, ' '); - else - seq_putc(m, '\n'); - - return 0; -} - -static struct seq_operations show_traces_seq_ops = { - .start = 
t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int show_traces_open(struct inode *inode, struct file *file) -{ - int ret; - - if (tracing_disabled) - return -ENODEV; - - ret = seq_open(file, &show_traces_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = trace_types; - } - - return ret; -} - -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) -{ - if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); - else - return 0; -} - -static struct file_operations tracing_fops = { - .open = tracing_open, - .read = seq_read, - .llseek = tracing_seek, - .release = tracing_release, -}; - -static struct file_operations tracing_lt_fops = { - .open = tracing_lt_open, - .read = seq_read, - .llseek = tracing_seek, - .release = tracing_release, -}; - -static struct file_operations show_traces_fops = { - .open = show_traces_open, - .read = seq_read, - .release = seq_release, -}; - -/* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_t tracing_cpumask = CPU_MASK_ALL; - -/* - * When tracing/tracing_cpu_mask is modified then this holds - * the new bitmask we are about to install: - */ -static cpumask_t tracing_cpumask_new; - -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - -static ssize_t -tracing_cpumask_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - int len; - - mutex_lock(&tracing_cpumask_update_lock); - - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); - if (count - len < 2) { - count = -EINVAL; - goto out_err; - } - len += sprintf(mask_str + len, "\n"); - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); - 
-out_err: - mutex_unlock(&tracing_cpumask_update_lock); - - return count; -} - -static ssize_t -tracing_cpumask_write(struct file *filp, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - int err, cpu; - - mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); - if (err) - goto err_unlock; - - raw_local_irq_disable(); - __raw_spin_lock(&ftrace_max_lock); - for_each_tracing_cpu(cpu) { - /* - * Increase/decrease the disabled counter if we are - * about to flip a bit in the cpumask: - */ - if (cpu_isset(cpu, tracing_cpumask) && - !cpu_isset(cpu, tracing_cpumask_new)) { - atomic_inc(&global_trace.data[cpu]->disabled); - } - if (!cpu_isset(cpu, tracing_cpumask) && - cpu_isset(cpu, tracing_cpumask_new)) { - atomic_dec(&global_trace.data[cpu]->disabled); - } - } - __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_enable(); - - tracing_cpumask = tracing_cpumask_new; - - mutex_unlock(&tracing_cpumask_update_lock); - - return count; - -err_unlock: - mutex_unlock(&tracing_cpumask_update_lock); - - return err; -} - -static struct file_operations tracing_cpumask_fops = { - .open = tracing_open_generic, - .read = tracing_cpumask_read, - .write = tracing_cpumask_write, -}; - -static ssize_t -tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; - int i; - - /* calulate max size */ - for (i = 0; trace_options[i]; i++) { - len += strlen(trace_options[i]); - len += 3; /* "no" and space */ - } - - /* +2 for \n and \0 */ - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; trace_options[i]; i++) { - if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s ", trace_options[i]); - else - r += sprintf(buf + r, "no%s ", trace_options[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; -} - -static ssize_t 
-tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp = buf; - int neg = 0; - int i; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (strncmp(buf, "no", 2) == 0) { - neg = 1; - cmp += 2; - } - - for (i = 0; trace_options[i]; i++) { - int len = strlen(trace_options[i]); - - if (strncmp(cmp, trace_options[i], len) == 0) { - if (neg) - trace_flags &= ~(1 << i); - else - trace_flags |= (1 << i); - break; - } - } - /* - * If no option could be set, return an error: - */ - if (!trace_options[i]) - return -EINVAL; - - filp->f_pos += cnt; - - return cnt; -} - -static struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, -}; - -static const char readme_msg[] = - "tracing mini-HOWTO:\n\n" - "# mkdir /debug\n" - "# mount -t debugfs nodev /debug\n\n" - "# cat /debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" - "# cat /debug/tracing/current_tracer\n" - "none\n" - "# echo sched_switch > /debug/tracing/current_tracer\n" - "# cat /debug/tracing/current_tracer\n" - "sched_switch\n" - "# cat /debug/tracing/iter_ctrl\n" - "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /debug/tracing/iter_ctrl\n" - "# echo 1 > /debug/tracing/tracing_enabled\n" - "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "echo 0 > /debug/tracing/tracing_enabled\n" -; - -static ssize_t -tracing_readme_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return simple_read_from_buffer(ubuf, cnt, ppos, - readme_msg, strlen(readme_msg)); -} - -static struct file_operations tracing_readme_fops = { - .open = tracing_open_generic, - .read = tracing_readme_read, -}; - -static ssize_t -tracing_ctrl_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t 
*ppos) -{ - struct trace_array *tr = filp->private_data; - char buf[64]; - int r; - - r = sprintf(buf, "%ld\n", tr->ctrl); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_ctrl_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - char buf[64]; - long val; - int ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - val = !!val; - - mutex_lock(&trace_types_lock); - if (tr->ctrl ^ val) { - if (val) - tracer_enabled = 1; - else - tracer_enabled = 0; - - tr->ctrl = val; - - if (current_trace && current_trace->ctrl_update) - current_trace->ctrl_update(tr); - } - mutex_unlock(&trace_types_lock); - - filp->f_pos += cnt; - - return cnt; -} - -static ssize_t -tracing_set_trace_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[max_tracer_type_len+2]; - int r; - - mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); - mutex_unlock(&trace_types_lock); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = &global_trace; - struct tracer *t; - char buf[max_tracer_type_len+1]; - int i; - - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace. 
*/ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; - - mutex_lock(&trace_types_lock); - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) - break; - } - if (!t || t == current_trace) - goto out; - - if (current_trace && current_trace->reset) - current_trace->reset(tr); - - current_trace = t; - if (t->init) - t->init(tr); - - out: - mutex_unlock(&trace_types_lock); - - filp->f_pos += cnt; - - return cnt; -} - -static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *ptr = filp->private_data; - char buf[64]; - int r; - - r = snprintf(buf, sizeof(buf), "%ld\n", - *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); - if (r > sizeof(buf)) - r = sizeof(buf); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - long *ptr = filp->private_data; - char buf[64]; - long val; - int ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - *ptr = val * 1000; - - return cnt; -} - -static atomic_t tracing_reader; - -static int tracing_open_pipe(struct inode *inode, struct file *filp) -{ - struct trace_iterator *iter; - - if (tracing_disabled) - return -ENODEV; - - /* We only allow for reader of the pipe */ - if (atomic_inc_return(&tracing_reader) != 1) { - atomic_dec(&tracing_reader); - return -EBUSY; - } - - /* create a buffer to store the information to pass to userspace */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - mutex_lock(&trace_types_lock); - iter->tr = &global_trace; - iter->trace = current_trace; - filp->private_data = iter; - - if (iter->trace->pipe_open) - iter->trace->pipe_open(iter); - mutex_unlock(&trace_types_lock); - - return 0; -} - -static int 
tracing_release_pipe(struct inode *inode, struct file *file) -{ - struct trace_iterator *iter = file->private_data; - - kfree(iter); - atomic_dec(&tracing_reader); - - return 0; -} - -static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) -{ - struct trace_iterator *iter = filp->private_data; - - if (trace_flags & TRACE_ITER_BLOCK) { - /* - * Always select as readable when in blocking mode - */ - return POLLIN | POLLRDNORM; - } else { - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - poll_wait(filp, &trace_wait, poll_table); - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - - return 0; - } -} - -/* - * Consumer reader. - */ -static ssize_t -tracing_read_pipe(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_iterator *iter = filp->private_data; - struct trace_array_cpu *data; - static cpumask_t mask; - unsigned long flags; -#ifdef CONFIG_FTRACE - int ftrace_save; -#endif - int cpu; - ssize_t sret; - - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - sret = 0; - - trace_seq_reset(&iter->seq); - - mutex_lock(&trace_types_lock); - if (iter->trace->read) { - sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); - if (sret) - goto out; - } - - while (trace_empty(iter)) { - - if ((filp->f_flags & O_NONBLOCK)) { - sret = -EAGAIN; - goto out; - } - - /* - * This is a make-shift waitqueue. The reason we don't use - * an actual wait queue is because: - * 1) we only ever have one waiter - * 2) the tracing, traces all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * Anyway, this is really very primitive wakeup. - */ - set_current_state(TASK_INTERRUPTIBLE); - iter->tr->waiter = current; - - mutex_unlock(&trace_types_lock); - - /* sleep for 100 msecs, and try again. 
*/ - schedule_timeout(HZ/10); - - mutex_lock(&trace_types_lock); - - iter->tr->waiter = NULL; - - if (signal_pending(current)) { - sret = -EINTR; - goto out; - } - - if (iter->trace != current_trace) - goto out; - - /* - * We block until we read something and tracing is disabled. - * We still block if tracing is disabled, but we have never - * read anything. This allows a user to cat this file, and - * then enable tracing. But after we have read something, - * we give an EOF when tracing is again disabled. - * - * iter->pos will be 0 if we haven't read anything. - */ - if (!tracer_enabled && iter->pos) - break; - - continue; - } - - /* stop when tracing is finished */ - if (trace_empty(iter)) - goto out; - - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; - - /* reset all but tr, trace, and overruns */ - memset(&iter->seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); - iter->pos = -1; - - /* - * We need to stop all tracing on all CPUS to read the - * the next buffer. This is a bit expensive, but is - * not done often. We fill all what we can read, - * and then release the locks again. 
- */ - - cpus_clear(mask); - local_irq_save(flags); -#ifdef CONFIG_FTRACE - ftrace_save = ftrace_enabled; - ftrace_enabled = 0; -#endif - smp_wmb(); - for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (!head_page(data) || !data->trace_idx) - continue; - - atomic_inc(&data->disabled); - cpu_set(cpu, mask); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_lock(&data->lock); - - if (data->overrun > iter->last_overrun[cpu]) - iter->overrun[cpu] += - data->overrun - iter->last_overrun[cpu]; - iter->last_overrun[cpu] = data->overrun; - } - - while (find_next_entry_inc(iter) != NULL) { - int ret; - int len = iter->seq.len; - - ret = print_trace_line(iter); - if (!ret) { - /* don't print partial lines */ - iter->seq.len = len; - break; - } - - trace_consume(iter); - - if (iter->seq.len >= cnt) - break; - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_unlock(&data->lock); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - atomic_dec(&data->disabled); - } -#ifdef CONFIG_FTRACE - ftrace_enabled = ftrace_save; -#endif - local_irq_restore(flags); - - /* Now copy what we have to the user */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.readpos >= iter->seq.len) - trace_seq_reset(&iter->seq); - if (sret == -EBUSY) - sret = 0; - -out: - mutex_unlock(&trace_types_lock); - - return sret; -} - -static ssize_t -tracing_entries_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - char buf[64]; - int r; - - r = sprintf(buf, "%lu\n", tr->entries); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_entries_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - char buf[64]; - int i, ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - 
ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - /* must have at least 1 entry */ - if (!val) - return -EINVAL; - - mutex_lock(&trace_types_lock); - - if (current_trace != &no_tracer) { - cnt = -EBUSY; - pr_info("ftrace: set current_tracer to none" - " before modifying buffer size\n"); - goto out; - } - - if (val > global_trace.entries) { - long pages_requested; - unsigned long freeable_pages; - - /* make sure we have enough memory before mapping */ - pages_requested = - (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; - - /* account for each buffer (and max_tr) */ - pages_requested *= tracing_nr_buffers * 2; - - /* Check for overflow */ - if (pages_requested < 0) { - cnt = -ENOMEM; - goto out; - } - - freeable_pages = determine_dirtyable_memory(); - - /* we only allow to request 1/4 of useable memory */ - if (pages_requested > - ((freeable_pages + tracing_pages_allocated) / 4)) { - cnt = -ENOMEM; - goto out; - } - - while (global_trace.entries < val) { - if (trace_alloc_page()) { - cnt = -ENOMEM; - goto out; - } - /* double check that we don't go over the known pages */ - if (tracing_pages_allocated > pages_requested) - break; - } - - } else { - /* include the number of entries in val (inc of page entries) */ - while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) - trace_free_page(); - } - - /* check integrity */ - for_each_tracing_cpu(i) - check_pages(global_trace.data[i]); - - filp->f_pos += cnt; - - /* If check pages failed, return ENOMEM */ - if (tracing_disabled) - cnt = -ENOMEM; - out: - max_tr.entries = global_trace.entries; - mutex_unlock(&trace_types_lock); - - return cnt; -} - -static struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, -}; - -static struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, -}; - -static struct file_operations 
set_tracer_fops = { - .open = tracing_open_generic, - .read = tracing_set_trace_read, - .write = tracing_set_trace_write, -}; - -static struct file_operations tracing_pipe_fops = { - .open = tracing_open_pipe, - .poll = tracing_poll_pipe, - .read = tracing_read_pipe, - .release = tracing_release_pipe, -}; - -static struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, - .read = tracing_entries_read, - .write = tracing_entries_write, -}; - -#ifdef CONFIG_DYNAMIC_FTRACE - -static ssize_t -tracing_read_long(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - char buf[64]; - int r; - - r = sprintf(buf, "%ld\n", *p); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static struct file_operations tracing_read_long_fops = { - .open = tracing_open_generic, - .read = tracing_read_long, -}; -#endif - -static struct dentry *d_tracer; - -struct dentry *tracing_init_dentry(void) -{ - static int once; - - if (d_tracer) - return d_tracer; - - d_tracer = debugfs_create_dir("tracing", NULL); - - if (!d_tracer && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'tracing'\n"); - return NULL; - } - - return d_tracer; -} - -#ifdef CONFIG_FTRACE_SELFTEST -/* Let selftest have access to static functions in this file */ -#include "trace_selftest.c" -#endif - -static __init void tracer_init_debugfs(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); - - entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); - - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - 
pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - - entry = debugfs_create_file("latency_trace", 0444, d_tracer, - &global_trace, &tracing_lt_fops); - if (!entry) - pr_warning("Could not create debugfs 'latency_trace' entry\n"); - - entry = debugfs_create_file("trace", 0444, d_tracer, - &global_trace, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); - - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); - - entry = debugfs_create_file("trace_pipe", 0644, d_tracer, - NULL, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); - - entry = debugfs_create_file("trace_entries", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); - -#ifdef CONFIG_DYNAMIC_FTRACE - entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_read_long_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); -#endif -#ifdef CONFIG_SYSPROF_TRACER - 
init_tracer_sysprof_debugfs(d_tracer); -#endif -} - -static int trace_alloc_page(void) -{ - struct trace_array_cpu *data; - struct page *page, *tmp; - LIST_HEAD(pages); - void *array; - unsigned pages_allocated = 0; - int i; - - /* first allocate a page for each CPU */ - for_each_tracing_cpu(i) { - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } - - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); - -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); -#endif - } - - /* Now that we successfully allocate a page per CPU, add them */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - ClearPageLRU(page); - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - SetPageLRU(page); -#endif - } - tracing_pages_allocated += pages_allocated; - global_trace.entries += ENTRIES_PER_PAGE; - - return 0; - - free_pages: - list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - return -ENOMEM; -} - -static int trace_free_page(void) -{ - struct trace_array_cpu *data; - struct page *page; - struct list_head *p; - int i; - int ret = 0; - - /* free one page from each buffer */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - 
WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - tracing_pages_allocated--; - __free_page(page); - - tracing_reset(data); - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - tracing_pages_allocated--; - __free_page(page); - - tracing_reset(data); -#endif - } - global_trace.entries -= ENTRIES_PER_PAGE; - - return ret; -} - -__init static int tracer_alloc_buffers(void) -{ - struct trace_array_cpu *data; - void *array; - struct page *page; - int pages = 0; - int ret = -ENOMEM; - int i; - - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_nr_buffers = num_possible_cpus(); - tracing_buffer_mask = cpu_possible_map; - - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); - - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } - - /* set the array to the list */ - INIT_LIST_HEAD(&data->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &data->trace_pages); - /* use the LRU flag to differentiate the two buffers */ - ClearPageLRU(page); - - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } - - 
INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &max_tr.data[i]->trace_pages); - SetPageLRU(page); -#endif - } - - /* - * Since we allocate by orders of pages, we may be able to - * round up a bit. - */ - global_trace.entries = ENTRIES_PER_PAGE; - pages++; - - while (global_trace.entries < trace_nr_entries) { - if (trace_alloc_page()) - break; - pages++; - } - max_tr.entries = global_trace.entries; - - pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", - pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); - pr_info(" actual entries %ld\n", global_trace.entries); - - tracer_init_debugfs(); - - trace_init_cmdlines(); - - register_tracer(&no_tracer); - current_trace = &no_tracer; - - /* All seems OK, enable tracing */ - global_trace.ctrl = tracer_enabled; - tracing_disabled = 0; - - return 0; - - free_buffers: - for (i-- ; i >= 0; i--) { - struct page *page, *tmp; - struct trace_array_cpu *data = global_trace.data[i]; - - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } -#endif - } - return ret; -} -fs_initcall(tracer_alloc_buffers); -/* - * ring buffer based function tracer - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * Based on code from the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include - -#include "trace.h" - -static void function_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); -} - -static void start_function_trace(struct trace_array *tr) -{ - tr->cpu = get_cpu(); - 
function_reset(tr); - put_cpu(); - - tracing_start_cmdline_record(); - tracing_start_function_trace(); -} - -static void stop_function_trace(struct trace_array *tr) -{ - tracing_stop_function_trace(); - tracing_stop_cmdline_record(); -} - -static void function_trace_init(struct trace_array *tr) -{ - if (tr->ctrl) - start_function_trace(tr); -} - -static void function_trace_reset(struct trace_array *tr) -{ - if (tr->ctrl) - stop_function_trace(tr); -} - -static void function_trace_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_function_trace(tr); - else - stop_function_trace(tr); -} - -static struct tracer function_trace __read_mostly = -{ - .name = "ftrace", - .init = function_trace_init, - .reset = function_trace_reset, - .ctrl_update = function_trace_ctrl_update, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_function, -#endif -}; - -static __init int init_function_trace(void) -{ - return register_tracer(&function_trace); -} - -device_initcall(init_function_trace); -/* - * trace irqs off criticall timings - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * From code in the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static struct trace_array *irqsoff_trace __read_mostly; -static int tracer_enabled __read_mostly; - -static DEFINE_PER_CPU(int, tracing_cpu); - -static DEFINE_SPINLOCK(max_trace_lock); - -enum { - TRACER_IRQS_OFF = (1 << 1), - TRACER_PREEMPT_OFF = (1 << 2), -}; - -static int trace_type __read_mostly; - -#ifdef CONFIG_PREEMPT_TRACER -static inline int -preempt_trace(void) -{ - return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); -} -#else -# define preempt_trace() (0) -#endif - -#ifdef CONFIG_IRQSOFF_TRACER -static inline int -irq_trace(void) -{ - return ((trace_type & TRACER_IRQS_OFF) && - irqs_disabled()); -} 
-#else -# define irq_trace() (0) -#endif - -/* - * Sequence count - we record it when starting a measurement and - * skip the latency if the sequence has changed - some other section - * did a maximum and could disturb our measurement with serial console - * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt - * decrease the validity of the maximum found: - */ -static __cacheline_aligned_in_smp unsigned long max_sequence; - -#ifdef CONFIG_FTRACE -/* - * irqsoff uses its own tracer function to keep the overhead down: - */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - /* - * Does not matter if we preempt. We test the flags - * afterward, to see if irqs are disabled or not. - * If we preempt and get a false positive, the flags - * test will fail. - */ - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) - return; - - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); - - atomic_dec(&data->disabled); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, -}; -#endif /* CONFIG_FTRACE */ - -/* - * Should this new latency be reported/recorded? 
- */ -static int report_latency(cycle_t delta) -{ - if (tracing_thresh) { - if (delta < tracing_thresh) - return 0; - } else { - if (delta <= tracing_max_latency) - return 0; - } - return 1; -} - -static void -check_critical_timing(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long parent_ip, - int cpu) -{ - unsigned long latency, t0, t1; - cycle_t T0, T1, delta; - unsigned long flags; - - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ - T0 = data->preempt_timestamp; - T1 = ftrace_now(cpu); - delta = T1-T0; - - local_save_flags(flags); - - if (!report_latency(delta)) - goto out; - - spin_lock_irqsave(&max_trace_lock, flags); - - /* check if we are still the max latency */ - if (!report_latency(delta)) - goto out_unlock; - - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); - - latency = nsecs_to_usecs(delta); - - if (data->critical_sequence != max_sequence) - goto out_unlock; - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - data->critical_end = parent_ip; - - update_max_tr_single(tr, current, cpu); - - max_sequence++; - -out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); - -out: - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(data); - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); -} - -static inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip) -{ - int cpu; - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - - if (likely(!tracer_enabled)) - return; - - cpu = raw_smp_processor_id(); - - if (per_cpu(tracing_cpu, cpu)) - return; - - data = tr->data[cpu]; - - if (unlikely(!data) || atomic_read(&data->disabled)) - return; - - atomic_inc(&data->disabled); - - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? 
: ip; - tracing_reset(data); - - local_save_flags(flags); - - trace_function(tr, data, ip, parent_ip, flags); - - per_cpu(tracing_cpu, cpu) = 1; - - atomic_dec(&data->disabled); -} - -static inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip) -{ - int cpu; - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - - cpu = raw_smp_processor_id(); - /* Always clear the tracing cpu on stopping the trace */ - if (unlikely(per_cpu(tracing_cpu, cpu))) - per_cpu(tracing_cpu, cpu) = 0; - else - return; - - if (!tracer_enabled) - return; - - data = tr->data[cpu]; - - if (unlikely(!data) || unlikely(!head_page(data)) || - !data->critical_start || atomic_read(&data->disabled)) - return; - - atomic_inc(&data->disabled); - - local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; - atomic_dec(&data->disabled); -} - -/* start and stop critical timings used to for stoppage (in idle) */ -void start_critical_timings(void) -{ - if (preempt_trace() || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL_GPL(start_critical_timings); - -void stop_critical_timings(void) -{ - if (preempt_trace() || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL_GPL(stop_critical_timings); - -#ifdef CONFIG_IRQSOFF_TRACER -#ifdef CONFIG_PROVE_LOCKING -void time_hardirqs_on(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(a0, a1); -} - -void time_hardirqs_off(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(a0, a1); -} - -#else /* !CONFIG_PROVE_LOCKING */ - -/* - * Stubs: - */ - -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void 
print_irqtrace_events(struct task_struct *curr) -{ -} - -/* - * We are only interested in hardirq on/off events: - */ -void trace_hardirqs_on(void) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -void trace_hardirqs_off(void) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -void trace_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_off_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -#endif /* CONFIG_PROVE_LOCKING */ -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) -{ - if (preempt_trace()) - stop_critical_timing(a0, a1); -} - -void trace_preempt_off(unsigned long a0, unsigned long a1) -{ - if (preempt_trace()) - start_critical_timing(a0, a1); -} -#endif /* CONFIG_PREEMPT_TRACER */ - -static void start_irqsoff_tracer(struct trace_array *tr) -{ - register_ftrace_function(&trace_ops); - tracer_enabled = 1; -} - -static void stop_irqsoff_tracer(struct trace_array *tr) -{ - tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); -} - -static void __irqsoff_tracer_init(struct trace_array *tr) -{ - irqsoff_trace = tr; - /* make sure that the tracer is visible */ - smp_wmb(); - - if (tr->ctrl) - start_irqsoff_tracer(tr); -} - -static void irqsoff_tracer_reset(struct trace_array *tr) -{ - if (tr->ctrl) - stop_irqsoff_tracer(tr); -} - -static void irqsoff_tracer_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_irqsoff_tracer(tr); - else - stop_irqsoff_tracer(tr); -} - -static void irqsoff_tracer_open(struct trace_iterator 
*iter) -{ - /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_irqsoff_tracer(iter->tr); -} - -static void irqsoff_tracer_close(struct trace_iterator *iter) -{ - if (iter->tr->ctrl) - start_irqsoff_tracer(iter->tr); -} - -#ifdef CONFIG_IRQSOFF_TRACER -static void irqsoff_tracer_init(struct trace_array *tr) -{ - trace_type = TRACER_IRQS_OFF; - - __irqsoff_tracer_init(tr); -} -static struct tracer irqsoff_tracer __read_mostly = -{ - .name = "irqsoff", - .init = irqsoff_tracer_init, - .reset = irqsoff_tracer_reset, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, - .print_max = 1, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_irqsoff, -#endif -}; -# define register_irqsoff(trace) register_tracer(&trace) -#else -# define register_irqsoff(trace) do { } while (0) -#endif - -#ifdef CONFIG_PREEMPT_TRACER -static void preemptoff_tracer_init(struct trace_array *tr) -{ - trace_type = TRACER_PREEMPT_OFF; - - __irqsoff_tracer_init(tr); -} - -static struct tracer preemptoff_tracer __read_mostly = -{ - .name = "preemptoff", - .init = preemptoff_tracer_init, - .reset = irqsoff_tracer_reset, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, - .print_max = 1, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_preemptoff, -#endif -}; -# define register_preemptoff(trace) register_tracer(&trace) -#else -# define register_preemptoff(trace) do { } while (0) -#endif - -#if defined(CONFIG_IRQSOFF_TRACER) && \ - defined(CONFIG_PREEMPT_TRACER) - -static void preemptirqsoff_tracer_init(struct trace_array *tr) -{ - trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - - __irqsoff_tracer_init(tr); -} - -static struct tracer preemptirqsoff_tracer __read_mostly = -{ - .name = "preemptirqsoff", - .init = preemptirqsoff_tracer_init, - .reset = irqsoff_tracer_reset, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, - 
.ctrl_update = irqsoff_tracer_ctrl_update, - .print_max = 1, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_preemptirqsoff, -#endif -}; - -# define register_preemptirqsoff(trace) register_tracer(&trace) -#else -# define register_preemptirqsoff(trace) do { } while (0) -#endif - -__init static int init_irqsoff_tracer(void) -{ - register_irqsoff(irqsoff_tracer); - register_preemptoff(preemptoff_tracer); - register_preemptirqsoff(preemptirqsoff_tracer); - - return 0; -} -device_initcall(init_irqsoff_tracer); -/* - * Memory mapped I/O tracing - * - * Copyright (C) 2008 Pekka Paalanen - */ - -#define DEBUG 1 - -#include -#include -#include - -#include "trace.h" - -struct header_iter { - struct pci_dev *dev; -}; - -static struct trace_array *mmio_trace_array; -static bool overrun_detected; - -static void mmio_reset_data(struct trace_array *tr) -{ - int cpu; - - overrun_detected = false; - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); -} - -static void mmio_trace_init(struct trace_array *tr) -{ - pr_debug("in %s\n", __func__); - mmio_trace_array = tr; - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } -} - -static void mmio_trace_reset(struct trace_array *tr) -{ - pr_debug("in %s\n", __func__); - if (tr->ctrl) - disable_mmiotrace(); - mmio_reset_data(tr); - mmio_trace_array = NULL; -} - -static void mmio_trace_ctrl_update(struct trace_array *tr) -{ - pr_debug("in %s\n", __func__); - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } else { - disable_mmiotrace(); - } -} - -static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) -{ - int ret = 0; - int i; - resource_size_t start, end; - const struct pci_driver *drv = pci_dev_driver(dev); - - /* XXX: incomplete checks for trace_seq_printf() return value */ - ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", - dev->bus->number, dev->devfn, - dev->vendor, dev->device, dev->irq); - /* - * XXX: is 
pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ - for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", - (unsigned long long)(start | - (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); - } - for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", - dev->resource[i].start < dev->resource[i].end ? - (unsigned long long)(end - start) + 1 : 0); - } - if (drv) - ret += trace_seq_printf(s, " %s\n", drv->name); - else - ret += trace_seq_printf(s, " \n"); - return ret; -} - -static void destroy_header_iter(struct header_iter *hiter) -{ - if (!hiter) - return; - pci_dev_put(hiter->dev); - kfree(hiter); -} - -static void mmio_pipe_open(struct trace_iterator *iter) -{ - struct header_iter *hiter; - struct trace_seq *s = &iter->seq; - - trace_seq_printf(s, "VERSION 20070824\n"); - - hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); - if (!hiter) - return; - - hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); - iter->private = hiter; -} - -/* XXX: This is not called when the pipe is closed! */ -static void mmio_close(struct trace_iterator *iter) -{ - struct header_iter *hiter = iter->private; - destroy_header_iter(hiter); - iter->private = NULL; -} - -static unsigned long count_overruns(struct trace_iterator *iter) -{ - int cpu; - unsigned long cnt = 0; - for_each_online_cpu(cpu) { - cnt += iter->overrun[cpu]; - iter->overrun[cpu] = 0; - } - return cnt; -} - -static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, - char __user *ubuf, size_t cnt, loff_t *ppos) -{ - ssize_t ret; - struct header_iter *hiter = iter->private; - struct trace_seq *s = &iter->seq; - unsigned long n; - - n = count_overruns(iter); - if (n) { - /* XXX: This is later than where events were lost. 
*/ - trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); - if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); - overrun_detected = true; - goto print_out; - } - - if (!hiter) - return 0; - - mmio_print_pcidev(s, hiter->dev); - hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); - - if (!hiter->dev) { - destroy_header_iter(hiter); - iter->private = NULL; - } - -print_out: - ret = trace_seq_to_user(s, ubuf, cnt); - return (ret == -EBUSY) ? 0 : ret; -} - -static int mmio_print_rw(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct mmiotrace_rw *rw = &entry->mmiorw; - struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); - unsigned long usec_rem = do_div(t, 1000000ULL); - unsigned secs = (unsigned long)t; - int ret = 1; - - switch (entry->mmiorw.opcode) { - case MMIO_READ: - ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, - (unsigned long long)rw->phys, - rw->value, rw->pc, 0); - break; - case MMIO_WRITE: - ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, - (unsigned long long)rw->phys, - rw->value, rw->pc, 0); - break; - case MMIO_UNKNOWN_OP: - ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", - secs, usec_rem, rw->map_id, - (unsigned long long)rw->phys, - (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, - (rw->value >> 0) & 0xff, rw->pc, 0); - break; - default: - ret = trace_seq_printf(s, "rw what?\n"); - break; - } - if (ret) - return 1; - return 0; -} - -static int mmio_print_map(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct mmiotrace_map *m = &entry->mmiomap; - struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); - unsigned long usec_rem = do_div(t, 1000000ULL); - unsigned secs = (unsigned long)t; - int ret = 1; - - switch (entry->mmiorw.opcode) 
{ - case MMIO_PROBE: - ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, - (unsigned long long)m->phys, m->virt, m->len, - 0UL, 0); - break; - case MMIO_UNPROBE: - ret = trace_seq_printf(s, - "UNMAP %lu.%06lu %d 0x%lx %d\n", - secs, usec_rem, m->map_id, 0UL, 0); - break; - default: - ret = trace_seq_printf(s, "map what?\n"); - break; - } - if (ret) - return 1; - return 0; -} - -/* return 0 to abort printing without consuming current entry in pipe mode */ -static int mmio_print_line(struct trace_iterator *iter) -{ - switch (iter->ent->type) { - case TRACE_MMIO_RW: - return mmio_print_rw(iter); - case TRACE_MMIO_MAP: - return mmio_print_map(iter); - default: - return 1; /* ignore unknown entries */ - } -} - -static struct tracer mmio_tracer __read_mostly = -{ - .name = "mmiotrace", - .init = mmio_trace_init, - .reset = mmio_trace_reset, - .pipe_open = mmio_pipe_open, - .close = mmio_close, - .read = mmio_read, - .ctrl_update = mmio_trace_ctrl_update, - .print_line = mmio_print_line, -}; - -__init static int init_mmio_trace(void) -{ - return register_tracer(&mmio_tracer); -} -device_initcall(init_mmio_trace); - -void mmio_trace_rw(struct mmiotrace_rw *rw) -{ - struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = tr->data[smp_processor_id()]; - __trace_mmiotrace_rw(tr, data, rw); -} - -void mmio_trace_mapping(struct mmiotrace_map *map) -{ - struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = tr->data[smp_processor_id()]; - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); -} -/* - * trace context switch - * - * Copyright (C) 2007 Steven Rostedt - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static struct trace_array *ctx_trace; -static int __read_mostly tracer_enabled; -static atomic_t sched_ref; - -static void -sched_switch_func(void *private, void *__rq, struct 
task_struct *prev, - struct task_struct *next) -{ - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - tracing_record_cmdline(prev); - tracing_record_cmdline(next); - - if (!tracer_enabled) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - tracing_sched_switch_trace(tr, data, prev, next, flags); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - if (!atomic_read(&sched_ref)) - return; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. 
- */ - sched_switch_func(probe_data, __rq, prev, next); -} - -static void -wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct - task_struct *curr) -{ - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (!tracer_enabled) - return; - - tracing_record_cmdline(curr); - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_func(probe_data, __rq, task, curr); -} - -static void sched_switch_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); -} - -static int tracing_sched_register(void) -{ - int ret; - - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); - if (ret) { - pr_info("wakeup trace: Couldn't add marker" - " probe to kernel_sched_wakeup\n"); - return ret; - } - - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); - if (ret) { - pr_info("wakeup trace: Couldn't add marker" - " probe to 
kernel_sched_wakeup_new\n"); - goto fail_deprobe; - } - - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &ctx_trace); - if (ret) { - pr_info("sched trace: Couldn't add marker" - " probe to kernel_sched_schedule\n"); - goto fail_deprobe_wake_new; - } - - return ret; -fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); -fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); - return ret; -} - -static void tracing_sched_unregister(void) -{ - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); -} - -static void tracing_start_sched_switch(void) -{ - long ref; - - ref = atomic_inc_return(&sched_ref); - if (ref == 1) - tracing_sched_register(); -} - -static void tracing_stop_sched_switch(void) -{ - long ref; - - ref = atomic_dec_and_test(&sched_ref); - if (ref) - tracing_sched_unregister(); -} - -void tracing_start_cmdline_record(void) -{ - tracing_start_sched_switch(); -} - -void tracing_stop_cmdline_record(void) -{ - tracing_stop_sched_switch(); -} - -static void start_sched_trace(struct trace_array *tr) -{ - sched_switch_reset(tr); - tracing_start_cmdline_record(); - tracer_enabled = 1; -} - -static void stop_sched_trace(struct trace_array *tr) -{ - tracer_enabled = 0; - tracing_stop_cmdline_record(); -} - -static void sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - - if (tr->ctrl) - start_sched_trace(tr); -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (tr->ctrl) - stop_sched_trace(tr); -} - -static void sched_switch_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the 
buffers */ - if (tr->ctrl) - start_sched_trace(tr); - else - stop_sched_trace(tr); -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .ctrl_update = sched_switch_trace_ctrl_update, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - int ret = 0; - - if (atomic_read(&sched_ref)) - ret = tracing_sched_register(); - if (ret) { - pr_info("error registering scheduler trace\n"); - return ret; - } - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); -/* - * trace task wakeup timings - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * Based on code from the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static struct trace_array *wakeup_trace; -static int __read_mostly tracer_enabled; - -static struct task_struct *wakeup_task; -static int wakeup_cpu; -static unsigned wakeup_prio = -1; - -static raw_spinlock_t wakeup_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -static void __wakeup_reset(struct trace_array *tr); - -#ifdef CONFIG_FTRACE -/* - * irqsoff uses its own tracer function to keep the overhead down: - */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int resched; - int cpu; - - if (likely(!wakeup_task)) - return; - - resched = need_resched(); - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1)) - goto out; - - local_irq_save(flags); - 
__raw_spin_lock(&wakeup_lock); - - if (unlikely(!wakeup_task)) - goto unlock; - - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. - */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; - - trace_function(tr, data, ip, parent_ip, flags); - - unlock: - __raw_spin_unlock(&wakeup_lock); - local_irq_restore(flags); - - out: - atomic_dec(&data->disabled); - - /* - * To prevent recursion from the scheduler, if the - * resched flag was set before we entered, then - * don't reschedule. - */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, -}; -#endif /* CONFIG_FTRACE */ - -/* - * Should this new latency be reported/recorded? - */ -static int report_latency(cycle_t delta) -{ - if (tracing_thresh) { - if (delta < tracing_thresh) - return 0; - } else { - if (delta <= tracing_max_latency) - return 0; - } - return 1; -} - -static void notrace -wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, - struct task_struct *next) -{ - unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; - struct trace_array_cpu *data; - cycle_t T0, T1, delta; - unsigned long flags; - long disabled; - int cpu; - - if (unlikely(!tracer_enabled)) - return; - - /* - * When we start a new trace, we set wakeup_task to NULL - * and then set tracer_enabled = 1. We want to make sure - * that another CPU does not see the tracer_enabled = 1 - * and the wakeup_task with an older task, that might - * actually be the same as next. 
- */ - smp_rmb(); - - if (next != wakeup_task) - return; - - /* The task we are waiting for is waking up */ - data = tr->data[wakeup_cpu]; - - /* disable local data, not wakeup_cpu data */ - cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&tr->data[cpu]->disabled); - if (likely(disabled != 1)) - goto out; - - local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - - /* We could race with grabbing wakeup_lock */ - if (unlikely(!tracer_enabled || next != wakeup_task)) - goto out_unlock; - - trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); - - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ - T0 = data->preempt_timestamp; - T1 = ftrace_now(cpu); - delta = T1-T0; - - if (!report_latency(delta)) - goto out_unlock; - - latency = nsecs_to_usecs(delta); - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - update_max_tr(tr, wakeup_task, wakeup_cpu); - -out_unlock: - __wakeup_reset(tr); - __raw_spin_unlock(&wakeup_lock); - local_irq_restore(flags); -out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - tracing_record_cmdline(prev); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. 
- */ - wakeup_sched_switch(probe_data, __rq, prev, next); -} - -static void __wakeup_reset(struct trace_array *tr) -{ - struct trace_array_cpu *data; - int cpu; - - for_each_possible_cpu(cpu) { - data = tr->data[cpu]; - tracing_reset(data); - } - - wakeup_cpu = -1; - wakeup_prio = -1; - - if (wakeup_task) - put_task_struct(wakeup_task); - - wakeup_task = NULL; -} - -static void wakeup_reset(struct trace_array *tr) -{ - unsigned long flags; - - local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - __wakeup_reset(tr); - __raw_spin_unlock(&wakeup_lock); - local_irq_restore(flags); -} - -static void -wakeup_check_start(struct trace_array *tr, struct task_struct *p, - struct task_struct *curr) -{ - int cpu = smp_processor_id(); - unsigned long flags; - long disabled; - - if (likely(!rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= curr->prio) - return; - - disabled = atomic_inc_return(&tr->data[cpu]->disabled); - if (unlikely(disabled != 1)) - goto out; - - /* interrupts should be off from try_to_wake_up */ - __raw_spin_lock(&wakeup_lock); - - /* check for races. 
*/ - if (!tracer_enabled || p->prio >= wakeup_prio) - goto out_locked; - - /* reset the trace */ - __wakeup_reset(tr); - - wakeup_cpu = task_cpu(p); - wakeup_prio = p->prio; - - wakeup_task = p; - get_task_struct(wakeup_task); - - local_save_flags(flags); - - tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, tr->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags); - -out_locked: - __raw_spin_unlock(&wakeup_lock); -out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct trace_array **ptr = probe_data; - struct trace_array *tr = *ptr; - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_check_start(tr, task, curr); -} - -static void start_wakeup_tracer(struct trace_array *tr) -{ - int ret; - - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); - if (ret) { - pr_info("wakeup trace: Couldn't add marker" - " probe to kernel_sched_wakeup\n"); - return; - } - - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); - if (ret) { - pr_info("wakeup trace: Couldn't add marker" - " probe to kernel_sched_wakeup_new\n"); - goto fail_deprobe; - } - - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &wakeup_trace); - if (ret) { - pr_info("sched trace: Couldn't add 
marker" - " probe to kernel_sched_schedule\n"); - goto fail_deprobe_wake_new; - } - - wakeup_reset(tr); - - /* - * Don't let the tracer_enabled = 1 show up before - * the wakeup_task is reset. This may be overkill since - * wakeup_reset does a spin_unlock after setting the - * wakeup_task to NULL, but I want to be safe. - * This is a slow path anyway. - */ - smp_wmb(); - - register_ftrace_function(&trace_ops); - - tracer_enabled = 1; - - return; -fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); -fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); -} - -static void stop_wakeup_tracer(struct trace_array *tr) -{ - tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); -} - -static void wakeup_tracer_init(struct trace_array *tr) -{ - wakeup_trace = tr; - - if (tr->ctrl) - start_wakeup_tracer(tr); -} - -static void wakeup_tracer_reset(struct trace_array *tr) -{ - if (tr->ctrl) { - stop_wakeup_tracer(tr); - /* make sure we put back any tasks we are tracing */ - wakeup_reset(tr); - } -} - -static void wakeup_tracer_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_wakeup_tracer(tr); - else - stop_wakeup_tracer(tr); -} - -static void wakeup_tracer_open(struct trace_iterator *iter) -{ - /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_wakeup_tracer(iter->tr); -} - -static void wakeup_tracer_close(struct trace_iterator *iter) -{ - /* forget about any processes we were recording */ - if (iter->tr->ctrl) - start_wakeup_tracer(iter->tr); -} - -static struct tracer wakeup_tracer __read_mostly = -{ - .name = "wakeup", - .init = wakeup_tracer_init, - .reset = 
wakeup_tracer_reset, - .open = wakeup_tracer_open, - .close = wakeup_tracer_close, - .ctrl_update = wakeup_tracer_ctrl_update, - .print_max = 1, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_wakeup, -#endif -}; - -__init static int init_wakeup_tracer(void) -{ - int ret; - - ret = register_tracer(&wakeup_tracer); - if (ret) - return ret; - - return 0; -} -device_initcall(init_wakeup_tracer); -/* Include in trace.c */ - -#include -#include - -static inline int trace_valid_entry(struct trace_entry *entry) -{ - switch (entry->type) { - case TRACE_FN: - case TRACE_CTX: - case TRACE_WAKE: - case TRACE_STACK: - case TRACE_SPECIAL: - return 1; - } - return 0; -} - -static int -trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) -{ - struct trace_entry *entries; - struct page *page; - int idx = 0; - int i; - - BUG_ON(list_empty(&data->trace_pages)); - page = list_entry(data->trace_pages.next, struct page, lru); - entries = page_address(page); - - check_pages(data); - if (head_page(data) != entries) - goto failed; - - /* - * The starting trace buffer always has valid elements, - * if any element exists. - */ - entries = head_page(data); - - for (i = 0; i < tr->entries; i++) { - - if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { - printk(KERN_CONT ".. invalid entry %d ", - entries[idx].type); - goto failed; - } - - idx++; - if (idx >= ENTRIES_PER_PAGE) { - page = virt_to_page(entries); - if (page->lru.next == &data->trace_pages) { - if (i != tr->entries - 1) { - printk(KERN_CONT ".. entries buffer mismatch"); - goto failed; - } - } else { - page = list_entry(page->lru.next, struct page, lru); - entries = page_address(page); - } - idx = 0; - } - } - - page = virt_to_page(entries); - if (page->lru.next != &data->trace_pages) { - printk(KERN_CONT ".. too many entries"); - goto failed; - } - - return 0; - - failed: - /* disable tracing */ - tracing_disabled = 1; - printk(KERN_CONT ".. corrupted trace buffer .. 
"); - return -1; -} - -/* - * Test the trace buffer to see if all the elements - * are still sane. - */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) -{ - unsigned long flags, cnt = 0; - int cpu, ret = 0; - - /* Don't allow flipping of max traces now */ - raw_local_irq_save(flags); - __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(cpu) { - if (!head_page(tr->data[cpu])) - continue; - - cnt += tr->data[cpu]->trace_idx; - - ret = trace_test_buffer_cpu(tr, tr->data[cpu]); - if (ret) - break; - } - __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_restore(flags); - - if (count) - *count = cnt; - - return ret; -} - -#ifdef CONFIG_FTRACE - -#ifdef CONFIG_DYNAMIC_FTRACE - -#define __STR(x) #x -#define STR(x) __STR(x) - -/* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) -{ - unsigned long count; - int ret; - int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; - char *func_name; - - /* The ftrace test PASSED */ - printk(KERN_CONT "PASSED\n"); - pr_info("Testing dynamic ftrace: "); - - /* enable tracing, and record the filter function */ - ftrace_enabled = 1; - tracer_enabled = 1; - - /* passed in by parameter to fool gcc from optimizing */ - func(); - - /* update the records */ - ret = ftrace_force_update(); - if (ret) { - printk(KERN_CONT ".. ftraced failed .. "); - return ret; - } - - /* - * Some archs *cough*PowerPC*cough* add charachters to the - * start of the function names. We simply put a '*' to - * accomodate them. 
- */ - func_name = "*" STR(DYN_FTRACE_TEST_NAME); - - /* filter only on our function */ - ftrace_set_filter(func_name, strlen(func_name), 1); - - /* enable tracing */ - tr->ctrl = 1; - trace->init(tr); - /* Sleep for a 1/10 of a second */ - msleep(100); - - /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); - if (ret) - goto out; - - if (count) { - ret = -1; - printk(KERN_CONT ".. filter did not filter .. "); - goto out; - } - - /* call our function again */ - func(); - - /* sleep again */ - msleep(100); - - /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); - ftrace_enabled = 0; - - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - - /* we should only have one item */ - if (!ret && count != 1) { - printk(KERN_CONT ".. filter failed count=%ld ..", count); - ret = -1; - goto out; - } - out: - ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; - - /* Enable tracing on all functions again */ - ftrace_set_filter(NULL, 0, 1); - - return ret; -} -#else -# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) -#endif /* CONFIG_DYNAMIC_FTRACE */ -/* - * Simple verification test of ftrace function tracer. - * Enable ftrace, sleep 1/10 second, and then read the trace - * buffer to see if all is in order. - */ -int -trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; - - /* make sure msleep has been recorded */ - msleep(1); - - /* force the recorded functions to be traced */ - ret = ftrace_force_update(); - if (ret) { - printk(KERN_CONT ".. ftraced failed .. "); - return ret; - } - - /* start the tracing */ - ftrace_enabled = 1; - tracer_enabled = 1; - - tr->ctrl = 1; - trace->init(tr); - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); - ftrace_enabled = 0; - - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - goto out; - } - - ret = trace_selftest_startup_dynamic_tracing(trace, tr, - DYN_FTRACE_TEST_NAME); - - out: - ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; - - /* kill ftrace totally if we failed */ - if (ret) - ftrace_kill(); - - return ret; -} -#endif /* CONFIG_FTRACE */ - -#ifdef CONFIG_IRQSOFF_TRACER -int -trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - unsigned long count; - int ret; - - /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); - /* reset the max latency */ - tracing_max_latency = 0; - /* disable interrupts for a bit */ - local_irq_disable(); - udelay(100); - local_irq_enable(); - /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (!ret) - ret = trace_test_buffer(&max_tr, &count); - trace->reset(tr); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - tracing_max_latency = save_max; - - return ret; -} -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -int -trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - unsigned long count; - int ret; - - /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); - /* reset the max latency */ - tracing_max_latency = 0; - /* disable preemption for a bit */ - preempt_disable(); - udelay(100); - preempt_enable(); - /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (!ret) - ret = trace_test_buffer(&max_tr, &count); - trace->reset(tr); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - tracing_max_latency = save_max; - - return ret; -} -#endif /* CONFIG_PREEMPT_TRACER */ - -#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) -int -trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - unsigned long count; - int ret; - - /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); - - /* reset the max latency */ - tracing_max_latency = 0; - - /* disable preemption and interrupts for a bit */ - preempt_disable(); - local_irq_disable(); - udelay(100); - preempt_enable(); - /* reverse the order of preempt vs irqs */ - local_irq_enable(); - - /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (ret) - goto out; - - ret = trace_test_buffer(&max_tr, &count); - if (ret) - goto out; - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - goto out; - } - - /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; - tr->ctrl = 1; - trace->ctrl_update(tr); - preempt_disable(); - local_irq_disable(); - udelay(100); - preempt_enable(); - /* reverse the order of preempt vs irqs */ - local_irq_enable(); - - /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (ret) - goto out; - - ret = trace_test_buffer(&max_tr, &count); - - if (!ret && !count) { - printk(KERN_CONT ".. 
no entries found .."); - ret = -1; - goto out; - } - - out: - trace->reset(tr); - tracing_max_latency = save_max; - - return ret; -} -#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ - -#ifdef CONFIG_SCHED_TRACER -static int trace_wakeup_test_thread(void *data) -{ - /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; - struct completion *x = data; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* Make it know we have a new prio */ - complete(x); - - /* now go to sleep and let the test wake us up */ - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - /* we are awake, now wait to disappear */ - while (!kthread_should_stop()) { - /* - * This is an RT task, do short sleeps to let - * others run. - */ - msleep(100); - } - - return 0; -} - -int -trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - struct task_struct *p; - struct completion isrt; - unsigned long count; - int ret; - - init_completion(&isrt); - - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); - if (IS_ERR(p)) { - printk(KERN_CONT "Failed to create ftrace wakeup test thread "); - return -1; - } - - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); - - /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); - /* reset the max latency */ - tracing_max_latency = 0; - - /* sleep to let the RT thread sleep too */ - msleep(100); - - /* - * Yes this is slightly racy. It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. 
- */ - - wake_up_process(p); - - /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (!ret) - ret = trace_test_buffer(&max_tr, &count); - - - trace->reset(tr); - - tracing_max_latency = save_max; - - /* kill the thread */ - kthread_stop(p); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SCHED_TRACER */ - -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -int -trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - tr->ctrl = 1; - trace->init(tr); - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. 
*/ - tr->ctrl = 0; - trace->ctrl_update(tr); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ -#include "trace.h" - -int DYN_FTRACE_TEST_NAME(void) -{ - /* used to call mcount */ - return 0; -} -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - 
info->pos++; - } -} - -const static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum 
hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); -} - -static void start_stack_timers(void) -{ - cpumask_t saved_mask = current->cpus_allowed; - int cpu; - - for_each_online_cpu(cpu) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - start_stack_timer(cpu); - } - set_cpus_allowed_ptr(current, &saved_mask); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stack_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); -} - -static void start_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stack_reset(tr); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static void stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - if (tr->ctrl) - start_stack_trace(tr); -} - -static void stack_trace_reset(struct trace_array *tr) -{ - if (tr->ctrl) - stop_stack_trace(tr); -} - -static void stack_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - 
start_stack_trace(tr); - else - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, - .ctrl_update = stack_trace_ctrl_update, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - struct dentry *entry; - - entry = debugfs_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); -} -/* - * tsacct.c - System accounting over taskstats interface - * - * Copyright (C) Jay Lan, - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License 
as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include - -/* - * fill in basic accounting fields - */ -void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) -{ - struct timespec uptime, ts; - u64 ac_etime; - - BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); - - /* calculate task elapsed time in timespec */ - do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec (should never be negative) */ - ac_etime = timespec_to_ns(&ts); - do_div(ac_etime, NSEC_PER_USEC); - stats->ac_etime = ac_etime; - stats->ac_btime = get_seconds() - ts.tv_sec; - if (thread_group_leader(tsk)) { - stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } - if (tsk->flags & PF_SUPERPRIV) - stats->ac_flag |= ASU; - if (tsk->flags & PF_DUMPCORE) - stats->ac_flag |= ACORE; - if (tsk->flags & PF_SIGNALED) - stats->ac_flag |= AXSIG; - stats->ac_nice = task_nice(tsk); - stats->ac_sched = tsk->policy; - stats->ac_uid = tsk->uid; - stats->ac_gid = tsk->gid; - stats->ac_pid = tsk->pid; - rcu_read_lock(); - stats->ac_ppid = pid_alive(tsk) ? 
- rcu_dereference(tsk->real_parent)->tgid : 0; - rcu_read_unlock(); - stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; - stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; - stats->ac_utimescaled = - cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; - stats->ac_stimescaled = - cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; - stats->ac_minflt = tsk->min_flt; - stats->ac_majflt = tsk->maj_flt; - - strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); -} - - -#ifdef CONFIG_TASK_XACCT - -#define KB 1024 -#define MB (1024*KB) -/* - * fill in extended accounting fields - */ -void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) -{ - struct mm_struct *mm; - - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; - mm = get_task_mm(p); - if (mm) { - /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; - mmput(mm); - } - stats->read_char = p->ioac.rchar; - stats->write_char = p->ioac.wchar; - stats->read_syscalls = p->ioac.syscr; - stats->write_syscalls = p->ioac.syscw; -#ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; -#else - stats->read_bytes = 0; - stats->write_bytes = 0; - stats->cancelled_write_bytes = 0; -#endif -} -#undef KB -#undef MB - -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) -{ - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = tsk->stime + tsk->utime; - dtime = cputime_sub(time, tsk->acct_timexpd); - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = 
delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } -} - -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - tsk->acct_timexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; -} -#endif -/* - * Wrapper functions for 16bit uid back compatibility. All nicely tied - * together in the faint hope we can take the out in five years time. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) -{ - long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; -} - -SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) -{ - long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; -} - -SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) -{ - long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, fd, user, group); - return ret; -} - -SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) -{ - long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, rgid, egid); - return ret; -} - -SYSCALL_DEFINE1(setgid16, old_gid_t, gid) -{ - long ret = sys_setgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - 
asmlinkage_protect(1, ret, gid); - return ret; -} - -SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) -{ - long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, ruid, euid); - return ret; -} - -SYSCALL_DEFINE1(setuid16, old_uid_t, uid) -{ - long ret = sys_setuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; -} - -SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) -{ - long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), - low2highuid(suid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, ruid, euid, suid); - return ret; -} - -SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) -{ - int retval; - - if (!(retval = put_user(high2lowuid(current->uid), ruid)) && - !(retval = put_user(high2lowuid(current->euid), euid))) - retval = put_user(high2lowuid(current->suid), suid); - - return retval; -} - -SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) -{ - long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), - low2highgid(sgid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, rgid, egid, sgid); - return ret; -} - - -SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) -{ - int retval; - - if (!(retval = put_user(high2lowgid(current->gid), rgid)) && - !(retval = put_user(high2lowgid(current->egid), egid))) - retval = put_user(high2lowgid(current->sgid), sgid); - - return retval; -} - -SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) -{ - long ret = sys_setfsuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; -} - -SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) -{ - long ret = sys_setfsgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - 
asmlinkage_protect(1, ret, gid); - return ret; -} - -static int groups16_to_user(old_gid_t __user *grouplist, - struct group_info *group_info) -{ - int i; - old_gid_t group; - - for (i = 0; i < group_info->ngroups; i++) { - group = high2lowgid(GROUP_AT(group_info, i)); - if (put_user(group, grouplist+i)) - return -EFAULT; - } - - return 0; -} - -static int groups16_from_user(struct group_info *group_info, - old_gid_t __user *grouplist) -{ - int i; - old_gid_t group; - - for (i = 0; i < group_info->ngroups; i++) { - if (get_user(group, grouplist+i)) - return -EFAULT; - GROUP_AT(group_info, i) = low2highgid(group); - } - - return 0; -} - -SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist) -{ - int i = 0; - - if (gidsetsize < 0) - return -EINVAL; - - get_group_info(current->group_info); - i = current->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups16_to_user(grouplist, current->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - put_group_info(current->group_info); - return i; -} - -SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) -{ - struct group_info *group_info; - int retval; - - if (!capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups16_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } - - retval = set_current_groups(group_info); - put_group_info(group_info); - - return retval; -} - -SYSCALL_DEFINE0(getuid16) -{ - return high2lowuid(current->uid); -} - -SYSCALL_DEFINE0(geteuid16) -{ - return high2lowuid(current->euid); -} - -SYSCALL_DEFINE0(getgid16) -{ - return high2lowgid(current->gid); -} - -SYSCALL_DEFINE0(getegid16) -{ - return high2lowgid(current->egid); -} -/* - * The "user cache". 
- * - * (C) Copyright 1991-2000 Linus Torvalds - * - * We have a per-user structure to keep track of how many - * processes, files etc the user has claimed, in order to be - * able to have per-user limits for system resources. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct user_namespace init_user_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .root_user = &root_user, -}; -EXPORT_SYMBOL_GPL(init_user_ns); - -/* - * UID task count cache, to get fast user lookup in "alloc_uid" - * when changing user ID's (ie setuid() and friends). - */ - -#define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) - -static struct kmem_cache *uid_cachep; - -/* - * The uidhash_lock is mostly taken from process context, but it is - * occasionally also taken from softirq/tasklet context, when - * task-structs get RCU-freed. Hence all locking must be softirq-safe. - * But free_uid() is also called with local interrupts disabled, and running - * local_bh_enable() with local interrupts disabled is an error - we'll run - * softirq callbacks, and they can unconditionally enable interrupts, and - * the caller of free_uid() didn't expect that.. - */ -static DEFINE_SPINLOCK(uidhash_lock); - -struct user_struct root_user = { - .__count = ATOMIC_INIT(1), - .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), - .sigpending = ATOMIC_INIT(0), - .locked_shm = 0, -#ifdef CONFIG_USER_SCHED - .tg = &init_task_group, -#endif -}; - -/* - * These routines must be called with the uidhash spinlock held! 
- */ -static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) -{ - hlist_add_head(&up->uidhash_node, hashent); -} - -static void uid_hash_remove(struct user_struct *up) -{ - hlist_del_init(&up->uidhash_node); -} - -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) -{ - struct user_struct *user; - struct hlist_node *h; - - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { - atomic_inc(&user->__count); - return user; - } - } - - return NULL; -} - -#ifdef CONFIG_USER_SCHED - -static void sched_destroy_user(struct user_struct *up) -{ - sched_destroy_group(up->tg); -} - -static int sched_create_user(struct user_struct *up) -{ - int rc = 0; - - up->tg = sched_create_group(&root_task_group); - if (IS_ERR(up->tg)) - rc = -ENOMEM; - - return rc; -} - -static void sched_switch_user(struct task_struct *p) -{ - sched_move_task(p); -} - -#else /* CONFIG_USER_SCHED */ - -static void sched_destroy_user(struct user_struct *up) { } -static int sched_create_user(struct user_struct *up) { return 0; } -static void sched_switch_user(struct task_struct *p) { } - -#endif /* CONFIG_USER_SCHED */ - -#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) - -static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ -static DEFINE_MUTEX(uids_mutex); - -static inline void uids_mutex_lock(void) -{ - mutex_lock(&uids_mutex); -} - -static inline void uids_mutex_unlock(void) -{ - mutex_unlock(&uids_mutex); -} - -/* uid directory attributes */ -#ifdef CONFIG_FAIR_GROUP_SCHED -static ssize_t cpu_shares_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); -} - -static ssize_t cpu_shares_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct 
user_struct, kobj); - unsigned long shares; - int rc; - - sscanf(buf, "%lu", &shares); - - rc = sched_group_set_shares(up->tg, shares); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_share_attr = - __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static ssize_t cpu_rt_runtime_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); -} - -static ssize_t cpu_rt_runtime_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_runtime; - int rc; - - sscanf(buf, "%lu", &rt_runtime); - - rc = sched_group_set_rt_runtime(up->tg, rt_runtime); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_rt_runtime_attr = - __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); - -static ssize_t cpu_rt_period_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); -} - -static ssize_t cpu_rt_period_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_period; - int rc; - - sscanf(buf, "%lu", &rt_period); - - rc = sched_group_set_rt_period(up->tg, rt_period); - - return (rc ? 
rc : size); -} - -static struct kobj_attribute cpu_rt_period_attr = - __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); -#endif - -/* default attributes per uid directory */ -static struct attribute *uids_attributes[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - &cpu_share_attr.attr, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - &cpu_rt_runtime_attr.attr, - &cpu_rt_period_attr.attr, -#endif - NULL -}; - -/* the lifetime of user_struct is not managed by the core (now) */ -static void uids_release(struct kobject *kobj) -{ - return; -} - -static struct kobj_type uids_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = uids_attributes, - .release = uids_release, -}; - -/* create /sys/kernel/uids//cpu_share file for this user */ -static int uids_user_create(struct user_struct *up) -{ - struct kobject *kobj = &up->kobj; - int error; - - memset(kobj, 0, sizeof(struct kobject)); - kobj->kset = uids_kset; - error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); - if (error) { - kobject_put(kobj); - goto done; - } - - kobject_uevent(kobj, KOBJ_ADD); -done: - return error; -} - -/* create these entries in sysfs: - * "/sys/kernel/uids" directory - * "/sys/kernel/uids/0" directory (for root user) - * "/sys/kernel/uids/0/cpu_share" file (for root user) - */ -int __init uids_sysfs_init(void) -{ - uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); - if (!uids_kset) - return -ENOMEM; - - return uids_user_create(&root_user); -} - -/* work function to remove sysfs directory for a user and free up - * corresponding structures. - */ -static void remove_user_sysfs_dir(struct work_struct *w) -{ - struct user_struct *up = container_of(w, struct user_struct, work); - unsigned long flags; - int remove_user = 0; - - /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() - * atomic. 
- */ - uids_mutex_lock(); - - local_irq_save(flags); - - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { - uid_hash_remove(up); - remove_user = 1; - spin_unlock_irqrestore(&uidhash_lock, flags); - } else { - local_irq_restore(flags); - } - - if (!remove_user) - goto done; - - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); - - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - -done: - uids_mutex_unlock(); -} - -/* IRQs are disabled and uidhash_lock is held upon function entry. - * IRQ state (as stored in flags) is restored and uidhash_lock released - * upon function exit. - */ -static inline void free_user(struct user_struct *up, unsigned long flags) -{ - /* restore back the count */ - atomic_inc(&up->__count); - spin_unlock_irqrestore(&uidhash_lock, flags); - - INIT_WORK(&up->work, remove_user_sysfs_dir); - schedule_work(&up->work); -} - -#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ - -int uids_sysfs_init(void) { return 0; } -static inline int uids_user_create(struct user_struct *up) { return 0; } -static inline void uids_mutex_lock(void) { } -static inline void uids_mutex_unlock(void) { } - -/* IRQs are disabled and uidhash_lock is held upon function entry. - * IRQ state (as stored in flags) is restored and uidhash_lock released - * upon function exit. - */ -static inline void free_user(struct user_struct *up, unsigned long flags) -{ - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); -} - -#endif - -/* - * Locate the user_struct for the passed UID. If found, take a ref on it. The - * caller must undo that ref with free_uid(). - * - * If the user_struct could not be found, return NULL. 
- */ -struct user_struct *find_user(uid_t uid) -{ - struct user_struct *ret; - unsigned long flags; - struct user_namespace *ns = current->nsproxy->user_ns; - - spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(ns, uid)); - spin_unlock_irqrestore(&uidhash_lock, flags); - return ret; -} - -void free_uid(struct user_struct *up) -{ - unsigned long flags; - - if (!up) - return; - - local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) - free_user(up, flags); - else - local_irq_restore(flags); -} - -struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) -{ - struct hlist_head *hashent = uidhashentry(ns, uid); - struct user_struct *up, *new; - - /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() - * atomic. - */ - uids_mutex_lock(); - - spin_lock_irq(&uidhash_lock); - up = uid_hash_find(uid, hashent); - spin_unlock_irq(&uidhash_lock); - - if (!up) { - new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); - if (!new) - goto out_unlock; - - new->uid = uid; - atomic_set(&new->__count, 1); - - if (sched_create_user(new) < 0) - goto out_free_user; - - if (uids_user_create(new)) - goto out_destoy_sched; - - /* - * Before adding this, check whether we raced - * on adding the same user already.. - */ - spin_lock_irq(&uidhash_lock); - up = uid_hash_find(uid, hashent); - if (up) { - /* This case is not possible when CONFIG_USER_SCHED - * is defined, since we serialize alloc_uid() using - * uids_mutex. Hence no need to call - * sched_destroy_user() or remove_user_sysfs_dir(). 
- */ - key_put(new->uid_keyring); - key_put(new->session_keyring); - kmem_cache_free(uid_cachep, new); - } else { - uid_hash_insert(new, hashent); - up = new; - } - spin_unlock_irq(&uidhash_lock); - - } - - uids_mutex_unlock(); - - return up; - -out_destoy_sched: - sched_destroy_user(new); -out_free_user: - kmem_cache_free(uid_cachep, new); -out_unlock: - uids_mutex_unlock(); - return NULL; -} - -void switch_uid(struct user_struct *new_user) -{ - struct user_struct *old_user; - - /* What if a process setreuid()'s and this brings the - * new uid over his NPROC rlimit? We can check this now - * cheaply with the new uid cache, so if it matters - * we should be checking for it. -DaveM - */ - old_user = current->user; - atomic_inc(&new_user->processes); - atomic_dec(&old_user->processes); - switch_uid_keyring(new_user); - current->user = new_user; - sched_switch_user(current); - - /* - * We need to synchronize with __sigqueue_alloc() - * doing a get_uid(p->user).. If that saw the old - * user value, we need to wait until it has exited - * its critical region before we can free the old - * structure. - */ - smp_mb(); - spin_unlock_wait(¤t->sighand->siglock); - - free_uid(old_user); - suid_keys(current); -} - -#ifdef CONFIG_USER_NS -void release_uids(struct user_namespace *ns) -{ - int i; - unsigned long flags; - struct hlist_head *head; - struct hlist_node *nd; - - spin_lock_irqsave(&uidhash_lock, flags); - /* - * collapse the chains so that the user_struct-s will - * be still alive, but not in hashes. subsequent free_uid() - * will free them. 
- */ - for (i = 0; i < UIDHASH_SZ; i++) { - head = ns->uidhash_table + i; - while (!hlist_empty(head)) { - nd = head->first; - hlist_del_init(nd); - } - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - free_uid(ns->root_user); -} -#endif - -static int __init uid_cache_init(void) -{ - int n; - - uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - - for(n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); - - /* Insert the root user immediately (init already runs as root) */ - spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); - spin_unlock_irq(&uidhash_lock); - - return 0; -} - -module_init(uid_cache_init); -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ - -#include -#include -#include -#include - -/* - * Clone a new ns copying an original user ns, setting refcount to 1 - * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise - */ -static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) -{ - struct user_namespace *ns; - struct user_struct *new_user; - int n; - - ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); - if (!ns) - return ERR_PTR(-ENOMEM); - - kref_init(&ns->kref); - - for (n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(ns->uidhash_table + n); - - /* Insert new root user. 
*/ - ns->root_user = alloc_uid(ns, 0); - if (!ns->root_user) { - kfree(ns); - return ERR_PTR(-ENOMEM); - } - - /* Reset current->user with a new one */ - new_user = alloc_uid(ns, current->uid); - if (!new_user) { - free_uid(ns->root_user); - kfree(ns); - return ERR_PTR(-ENOMEM); - } - - switch_uid(new_user); - return ns; -} - -struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns) -{ - struct user_namespace *new_ns; - - BUG_ON(!old_ns); - get_user_ns(old_ns); - - if (!(flags & CLONE_NEWUSER)) - return old_ns; - - new_ns = clone_user_ns(old_ns); - - put_user_ns(old_ns); - return new_ns; -} - -void free_user_ns(struct kref *kref) -{ - struct user_namespace *ns; - - ns = container_of(kref, struct user_namespace, kref); - release_uids(ns); - kfree(ns); -} -EXPORT_SYMBOL(free_user_ns); -/* - * Copyright (C) 2004 IBM Corporation - * - * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ - -#include -#include -#include -#include -#include - -/* - * Clone a new ns copying an original utsname, setting refcount to 1 - * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise - */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) -{ - struct uts_namespace *ns; - - ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); - if (!ns) - return ERR_PTR(-ENOMEM); - - down_read(&uts_sem); - memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - up_read(&uts_sem); - kref_init(&ns->kref); - return ns; -} - -/* - * Copy task tsk's utsname namespace, or clone it if flags - * specifies CLONE_NEWUTS. In latter case, changes to the - * utsname of this process won't be seen by parent, and vice - * versa. 
- */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) -{ - struct uts_namespace *new_ns; - - BUG_ON(!old_ns); - get_uts_ns(old_ns); - - if (!(flags & CLONE_NEWUTS)) - return old_ns; - - new_ns = clone_uts_ns(old_ns); - - put_uts_ns(old_ns); - return new_ns; -} - -void free_uts_ns(struct kref *kref) -{ - struct uts_namespace *ns; - - ns = container_of(kref, struct uts_namespace, kref); - kfree(ns); -} -/* - * Copyright (C) 2007 - * - * Author: Eric Biederman - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ - -#include -#include -#include -#include - -static void *get_uts(ctl_table *table, int write) -{ - char *which = table->data; - struct uts_namespace *uts_ns; - - uts_ns = current->nsproxy->uts_ns; - which = (which - (char *)&init_uts_ns) + (char *)uts_ns; - - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); - return which; -} - -static void put_uts(ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - -#ifdef CONFIG_PROC_FS -/* - * Special case of dostring for the UTS structure. This has locks - * to observe. Should this be in kernel/sys.c ???? 
- */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table uts_table; - int r; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); - put_uts(table, write, uts_table.data); - return r; -} -#else -#define proc_do_uts_string NULL -#endif - - -#ifdef CONFIG_SYSCTL_SYSCALL -/* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table uts_table; - int r, write; - write = newval && newlen; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); - put_uts(table, write, uts_table.data); - return r; -} -#else -#define sysctl_uts_string NULL -#endif - -static struct ctl_table uts_kern_table[] = { - { - .ctl_name = KERN_OSTYPE, - .procname = "ostype", - .data = init_uts_ns.name.sysname, - .maxlen = sizeof(init_uts_ns.name.sysname), - .mode = 0444, - .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, - }, - { - .ctl_name = KERN_OSRELEASE, - .procname = "osrelease", - .data = init_uts_ns.name.release, - .maxlen = sizeof(init_uts_ns.name.release), - .mode = 0444, - .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, - }, - { - .ctl_name = KERN_VERSION, - .procname = "version", - .data = init_uts_ns.name.version, - .maxlen = sizeof(init_uts_ns.name.version), - .mode = 0444, - .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, - }, - { - .ctl_name = KERN_NODENAME, - .procname = "hostname", - .data = init_uts_ns.name.nodename, - .maxlen = sizeof(init_uts_ns.name.nodename), - .mode = 0644, - .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, - }, 
- { - .ctl_name = KERN_DOMAINNAME, - .procname = "domainname", - .data = init_uts_ns.name.domainname, - .maxlen = sizeof(init_uts_ns.name.domainname), - .mode = 0644, - .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, - }, - {} -}; - -static struct ctl_table uts_root_table[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0555, - .child = uts_kern_table, - }, - {} -}; - -static int __init utsname_sysctl_init(void) -{ - register_sysctl_table(uts_root_table); - return 0; -} - -__initcall(utsname_sysctl_init); -/* - * Generic waiting primitives. - * - * (C) 2004 William Irwin, Oracle - */ -#include -#include -#include -#include -#include -#include - -void init_waitqueue_head(wait_queue_head_t *q) -{ - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->task_list); -} - -EXPORT_SYMBOL(init_waitqueue_head); - -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue); - -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue_exclusive); - -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(remove_wait_queue); - - -/* - * Note: we use "set_current_state()" _after_ the wait-queue add, - * because we need a memory barrier there on SMP, so that any - * wake-function that tests for the wait-queue being active - * will be guaranteed to see waitqueue addition _or_ subsequent - * tests in this thread will see the wakeup having taken place. 
- * - * The spin_unlock() itself is semi-permeable and only protects - * one way (it only protects stuff inside the critical region and - * stops them from bleeding out - it would still allow subsequent - * loads to move into the critical region). - */ -void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait); - -void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait_exclusive); - -/* - * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - /* - * We can check for list emptiness outside the lock - * IFF: - * - we use the "careful" check that verifies both - * the next and prev pointers, so that there cannot - * be any half-pending updates in progress on other - * CPU's that we haven't seen yet (and that might - * still change the stack area. 
- * and - * - all other users take the lock (ie we can only - * have _one_ other CPU that looks at or modifies - * the list). - */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); - } -} -EXPORT_SYMBOL(finish_wait); - -/* - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @state: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - int ret = default_wake_function(wait, mode, sync, key); - - if (ret) - list_del_init(&wait->task_list); - return ret; -} -EXPORT_SYMBOL(autoremove_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return 
autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - do { - int ret; - - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(q->key.flags); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, 
TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); -/* - * linux/kernel/workqueue.c - * - * Generic mechanism for defining kernel helper threads for running - * arbitrary tasks in process context. - * - * Started by Ingo Molnar, Copyright (C) 2002 - * - * Derived from the taskqueue/keventd code by: - * - * David Woodhouse - * Andrew Morton - * Kai Petzke - * Theodore Ts'o - * - * Made to use alloc_percpu by Christoph Lameter. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). - */ -struct cpu_workqueue_struct { - - spinlock_t lock; - - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; - - struct workqueue_struct *wq; - struct task_struct *thread; - - int run_depth; /* Detect run_workqueue() recursion depth */ -} ____cacheline_aligned; - -/* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: - */ -struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; - -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); - -static int singlethread_cpu __read_mostly; -static cpumask_t cpu_singlethread_map __read_mostly; -/* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. - */ -static cpumask_t cpu_populated_map __read_mostly; - -/* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_single_threaded(struct workqueue_struct *wq) -{ - return wq->singlethread; -} - -static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) -{ - return is_single_threaded(wq) - ? 
&cpu_singlethread_map : &cpu_populated_map; -} - -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) -{ - if (unlikely(is_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); -} - -/* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set - */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) -{ - unsigned long new; - - BUG_ON(!work_pending(work)); - - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); -} - -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) -{ - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); -} - -static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) -{ - set_wq_data(work, cwq); - /* - * Ensure that we get the right work->data if we see the - * result of list_add() below, see try_to_grab_pending(). - */ - smp_wmb(); - list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); -} - -static void __queue_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) -{ - unsigned long flags; - - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); -} - -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. 
- */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - int ret; - - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); - - return ret; -} -EXPORT_SYMBOL_GPL(queue_work); - -/** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. - */ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) -{ - int ret = 0; - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); - ret = 1; - } - return ret; -} -EXPORT_SYMBOL_GPL(queue_work_on); - -static void delayed_work_timer_fn(unsigned long __data) -{ - struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; - - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); -} - -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - if (delay == 0) - return queue_work(wq, &dwork->work); - - return queue_delayed_work_on(-1, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - -/** - * queue_delayed_work_on - queue work on specific CPU after delay - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. 
- */ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - - timer_stats_timer_set_start_info(&dwork->timer); - - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; - - if (unlikely(cpu >= 0)) - add_timer_on(timer, cpu); - else - add_timer(timer); - ret = 1; - } - return ret; -} -EXPORT_SYMBOL_GPL(queue_delayed_work_on); - -static void run_workqueue(struct cpu_workqueue_struct *cwq) -{ - spin_lock_irq(&cwq->lock); - cwq->run_depth++; - if (cwq->run_depth > 3) { - /* morton gets to eat his hat */ - printk("%s: recursion depth exceeded: %d\n", - __func__, cwq->run_depth); - dump_stack(); - } - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. 
- */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); - } - - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; - } - cwq->run_depth--; - spin_unlock_irq(&cwq->lock); -} - -static int worker_thread(void *__cwq) -{ - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); - - if (cwq->wq->freezeable) - set_freezable(); - - set_user_nice(current, -5); - - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); - - try_to_freeze(); - - if (kthread_should_stop()) - break; - - run_workqueue(cwq); - } - - return 0; -} - -struct wq_barrier { - struct work_struct work; - struct completion done; -}; - -static void wq_barrier_func(struct work_struct *work) -{ - struct wq_barrier *barr = container_of(work, struct wq_barrier, work); - complete(&barr->done); -} - -static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) -{ - INIT_WORK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - - init_completion(&barr->done); - - insert_work(cwq, &barr->work, head); -} - -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) -{ - int active; - - if 
(cwq->thread == current) { - /* - * Probably keventd trying to flush its own queue. So simply run - * it by hand rather than deadlocking. - */ - run_workqueue(cwq); - active = 1; - } else { - struct wq_barrier barr; - - active = 0; - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; - } - spin_unlock_irq(&cwq->lock); - - if (active) - wait_for_completion(&barr.done); - } - - return active; -} - -/** - * flush_workqueue - ensure that any scheduled work has run to completion. - * @wq: workqueue to flush - * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. - */ -void flush_workqueue(struct workqueue_struct *wq) -{ - const cpumask_t *cpu_map = wq_cpu_map(wq); - int cpu; - - might_sleep(); - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - for_each_cpu_mask_nr(cpu, *cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); -} -EXPORT_SYMBOL_GPL(flush_workqueue); - -/** - * flush_work - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * Returns false if @work has already terminated. - * - * It is expected that, prior to calling flush_work(), the caller has - * arranged for the work to not be requeued, otherwise it doesn't make - * sense to use this function. 
- */ -int flush_work(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - struct list_head *prev; - struct wq_barrier barr; - - might_sleep(); - cwq = get_wq_data(work); - if (!cwq) - return 0; - - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); - if (!list_empty(&work->entry)) { - /* - * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. - */ - smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; - } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; - } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; - - wait_for_completion(&barr.done); - return 1; -} -EXPORT_SYMBOL_GPL(flush_work); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - - cwq = get_wq_data(work); - if (!cwq) - return ret; - - spin_lock_irq(&cwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong cwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). 
- */ - smp_rmb(); - if (cwq == get_wq_data(work)) { - list_del_init(&work->entry); - ret = 1; - } - } - spin_unlock_irq(&cwq->lock); - - return ret; -} - -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) -{ - struct wq_barrier barr; - int running = 0; - - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); - - if (unlikely(running)) - wait_for_completion(&barr.done); -} - -static void wait_on_work(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const cpumask_t *cpu_map; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu_mask_nr(cpu, *cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); -} - -static int __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) -{ - int ret; - - do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); - wait_on_work(work); - } while (unlikely(ret < 0)); - - work_clear_pending(work); - return ret; -} - -/** - * cancel_work_sync - block until a work_struct's callback has terminated - * @work: the work which is to be flushed - * - * Returns true if @work was pending. - * - * cancel_work_sync() will cancel the work if it is queued. If the work's - * callback appears to be running, cancel_work_sync() will block until it - * has completed. - * - * It is possible to use this function if the work re-queues itself. It can - * cancel the work even if it migrates to another workqueue, however in that - * case it only guarantees that work->func() has completed on the last queued - * workqueue. 
- * - * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not - * pending, otherwise it goes into a busy-wait loop until the timer expires. - * - * The caller must ensure that workqueue_struct on which this work was last - * queued can't be destroyed before this function returns. - */ -int cancel_work_sync(struct work_struct *work) -{ - return __cancel_work_timer(work, NULL); -} -EXPORT_SYMBOL_GPL(cancel_work_sync); - -/** - * cancel_delayed_work_sync - reliably kill off a delayed work. - * @dwork: the delayed work struct - * - * Returns true if @dwork was pending. - * - * It is possible to use this function if @dwork rearms itself via queue_work() - * or queue_delayed_work(). See also the comment for cancel_work_sync(). - */ -int cancel_delayed_work_sync(struct delayed_work *dwork) -{ - return __cancel_work_timer(&dwork->work, &dwork->timer); -} -EXPORT_SYMBOL(cancel_delayed_work_sync); - -static struct workqueue_struct *keventd_wq __read_mostly; - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * This puts a job in the kernel-global workqueue. - */ -int schedule_work(struct work_struct *work) -{ - return queue_work(keventd_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/* - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -int schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, keventd_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. 
- */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(keventd_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_on_each_cpu - call a function on each online CPU from keventd - * @func: the function to call - * - * Returns zero on success. - * Returns -ve errno on failure. - * - * schedule_on_each_cpu() is very slow. - */ -int schedule_on_each_cpu(work_func_t func) -{ - int cpu; - struct work_struct *works; - - works = alloc_percpu(struct work_struct); - if (!works) - return -ENOMEM; - - get_online_cpus(); - for_each_online_cpu(cpu) { - struct work_struct *work = per_cpu_ptr(works, cpu); - - INIT_WORK(work, func); - schedule_work_on(cpu, work); - } - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); - free_percpu(works); - return 0; -} - -void flush_scheduled_work(void) -{ - flush_workqueue(keventd_wq); -} -EXPORT_SYMBOL(flush_scheduled_work); - -/** - * execute_in_process_context - reliably execute the routine with user context - * @fn: the function to execute - * @ew: guaranteed storage for the execute work structure (must - * be available when the work executes) - * - * Executes the function immediately if process context is available, - * otherwise schedules the function for delayed execution. 
- * - * Returns: 0 - function was executed - * 1 - function was scheduled for execution - */ -int execute_in_process_context(work_func_t fn, struct execute_work *ew) -{ - if (!in_interrupt()) { - fn(&ew->work); - return 0; - } - - INIT_WORK(&ew->work, fn); - schedule_work(&ew->work); - - return 1; -} -EXPORT_SYMBOL_GPL(execute_in_process_context); - -int keventd_up(void) -{ - return keventd_wq != NULL; -} - -int current_is_keventd(void) -{ - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); - - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; - - return ret; - -} - -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) -{ - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); - - return cwq; -} - -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) -{ - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; - - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. 
- */ - if (IS_ERR(p)) - return PTR_ERR(p); - - cwq->thread = p; - - return 0; -} - -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) -{ - struct task_struct *p = cwq->thread; - - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} - -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - return NULL; - - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } - - wq->name = name; - lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - INIT_LIST_HEAD(&wq->list); - - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. 
- */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); - } - - if (err) { - destroy_workqueue(wq); - wq = NULL; - } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); - -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ - /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. - */ - if (cwq->thread == NULL) - return; - - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - kthread_stop(cwq->thread); - cwq->thread = NULL; -} - -/** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue - * - * Safely destroy a workqueue. All work currently pending will be done first. 
- */ -void destroy_workqueue(struct workqueue_struct *wq) -{ - const cpumask_t *cpu_map = wq_cpu_map(wq); - int cpu; - - cpu_maps_update_begin(); - spin_lock(&workqueue_lock); - list_del(&wq->list); - spin_unlock(&workqueue_lock); - - for_each_cpu_mask_nr(cpu, *cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); - - free_percpu(wq->cpu_wq); - kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int ret = NOTIFY_OK; - - action &= ~CPU_TASKS_FROZEN; - - switch (action) { - case CPU_UP_PREPARE: - cpu_set(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - if (!create_workqueue_thread(cwq, cpu)) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - ret = NOTIFY_BAD; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; - } - } - - switch (action) { - case CPU_UP_CANCELED: - case CPU_POST_DEAD: - cpu_clear(cpu, cpu_populated_map); - } - - return ret; -} - -void __init init_workqueues(void) -{ - cpu_populated_map = cpu_online_map; - singlethread_cpu = first_cpu(cpu_possible_map); - cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); -} diff --git a/integration-tests/src/test/resources/kernel33.txt b/integration-tests/src/test/resources/kernel33.txt deleted file mode 100644 index a4e0907..0000000 --- a/integration-tests/src/test/resources/kernel33.txt +++ /dev/null @@ -1,177408 
+0,0 @@ -/* - * linux/kernel/acct.c - * - * BSD Process Accounting for Linux - * - * Author: Marco van Wieringen - * - * Some code based on ideas and code from: - * Thomas K. Dyas - * - * This file implements BSD-style process accounting. Whenever any - * process exits, an accounting record of type "struct acct" is - * written to the file specified with the acct() system call. It is - * up to user-level programs to do useful things with the accounting - * log. The kernel just provides the raw accounting information. - * - * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. - * - * Plugged two leaks. 1) It didn't return acct_file into the free_filps if - * the file happened to be read-only. 2) If the accounting was suspended - * due to the lack of space it happily allowed to reopen it and completely - * lost the old acct_file. 3/10/98, Al Viro. - * - * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). - * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. - * - * Fixed a nasty interaction with with sys_umount(). If the accointing - * was suspeneded we failed to stop it on umount(). Messy. - * Another one: remount to readonly didn't stop accounting. - * Question: what should we do if we have CAP_SYS_ADMIN but not - * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY - * unless we are messing with the root. In that case we are getting a - * real mess with do_remount_sb(). 9/11/98, AV. - * - * Fixed a bunch of races (and pair of leaks). Probably not the best way, - * but this one obviously doesn't introduce deadlocks. Later. BTW, found - * one race (and leak) in BSD implementation. - * OK, that's better. ANOTHER race and leak in BSD variant. There always - * is one more bug... 10/11/98, AV. - * - * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold - * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks - * a struct file opened for write. Fixed. 2/6/2000, AV. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* sector_div */ -#include - -/* - * These constants control the amount of freespace that suspend and - * resume the process accounting system, and the time delay between - * each check. - * Turned into sysctl-controllable parameters. AV, 12/11/98 - */ - -int acct_parm[3] = {4, 2, 30}; -#define RESUME (acct_parm[0]) /* >foo% free space - resume */ -#define SUSPEND (acct_parm[1]) /* active; - if (!file || time_is_before_jiffies(acct->needcheck)) - goto out; - spin_unlock(&acct_lock); - - /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. - */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act>0; - goto out; - } - - if (acct->active) { - if (act < 0) { - acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); - } - } else { - if (act > 0) { - acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); - } - } - - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; -out: - spin_unlock(&acct_lock); - return res; -} - -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. 
- */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) -{ - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; - acct->ns = NULL; - list_del(&acct->list); - } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; - list_add(&acct->list, &acct_list); - } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); - spin_lock(&acct_lock); - } -} - -static int acct_on(char *name) -{ - struct file *file; - struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; - - /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { - filp_close(file, NULL); - return -EACCES; - } - - if (!file->f_op->write) { - filp_close(file, NULL); - return -EIO; - } - - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } - } - - spin_lock(&acct_lock); - if (ns->bacct == NULL) { - ns->bacct = acct; - acct = NULL; - } - - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - - return 0; -} - -/** - * sys_acct - enable/disable process accounting - * @name: file name for accounting records or NULL to shutdown accounting - * - * Returns 0 for success or negative errno values for failure. - * - * sys_acct() is the only system call needed to implement process - * accounting. 
It takes the name of the file where accounting records - * should be written. If the filename is NULL, accounting will be - * shutdown. - */ -SYSCALL_DEFINE1(acct, const char __user *, name) -{ - int error = 0; - - if (!capable(CAP_SYS_PACCT)) - return -EPERM; - - if (name) { - char *tmp = getname(name); - if (IS_ERR(tmp)) - return (PTR_ERR(tmp)); - error = acct_on(tmp); - putname(tmp); - } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } - - return error; -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. 
- */ -void acct_auto_close(struct super_block *sb) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -void acct_exit_ns(struct pid_namespace *ns) -{ - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); -} - -/* - * encode an unsigned long into a comp_t - * - * This routine has been adopted from the encode_comp_t() function in - * the kern_acct.c file of the FreeBSD operating system. The encoding - * is a 13-bit fraction with a 3-bit (base 8) exponent. - */ - -#define MANTSIZE 13 /* 13 bit mantissa. */ -#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ -#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ - -static comp_t encode_comp_t(unsigned long value) -{ - int exp, rnd; - - exp = rnd = 0; - while (value > MAXFRACT) { - rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ - value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ - exp++; - } - - /* - * If we need to round up, do it (and handle overflow correctly). - */ - if (rnd && (++value > MAXFRACT)) { - value >>= EXPSIZE; - exp++; - } - - /* - * Clean it up and polish it off. - */ - exp <<= MANTSIZE; /* Shift the exponent into place */ - exp += value; /* and add on the mantissa. */ - return exp; -} - -#if ACCT_VERSION==1 || ACCT_VERSION==2 -/* - * encode an u64 into a comp2_t (24 bits) - * - * Format: 5 bit base 2 exponent, 20 bits mantissa. - * The leading bit of the mantissa is not stored, but implied for - * non-zero exponents. - * Largest encodable value is 50 bits. - */ - -#define MANTSIZE2 20 /* 20 bit mantissa. */ -#define EXPSIZE2 5 /* 5 bit base 2 exponent. 
*/ -#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 < (MAXFRACT2>>1)); - rnd = 0; - while (value > MAXFRACT2) { - rnd = value & 1; - value >>= 1; - exp++; - } - - /* - * If we need to round up, do it (and handle overflow correctly). - */ - if (rnd && (++value > MAXFRACT2)) { - value >>= 1; - exp++; - } - - if (exp > MAXEXP2) { - /* Overflow. Return largest representable number instead. */ - return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; - } else { - return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); - } -} -#endif - -#if ACCT_VERSION==3 -/* - * encode an u64 into a 32 bit IEEE float - */ -static u32 encode_float(u64 value) -{ - unsigned exp = 190; - unsigned u; - - if (value==0) return 0; - while ((s64)value > 0){ - value <<= 1; - exp--; - } - u = (u32)(value >> 40) & 0x7fffffu; - return u | (exp << 23); -} -#endif - -/* - * Write an accounting entry for an exiting process - * - * The acct_process() call is the workhorse of the process - * accounting system. The struct acct is built here and then written - * into the accounting file. This function should only be called from - * do_exit() or when switching to a different output file. - */ - -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) -{ - struct pacct_struct *pacct = ¤t->signal->pacct; - acct_t ac; - mm_segment_t fs; - unsigned long flim; - u64 elapsed; - u64 run_time; - struct timespec uptime; - struct tty_struct *tty; - const struct cred *orig_cred; - - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. 
- */ - if (!check_free_space(acct, file)) - goto out; - - /* - * Fill the accounting struct with the needed info as recorded - * by the different kernel functions. - */ - memset(&ac, 0, sizeof(acct_t)); - - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); - - /* calculate run_time in nsec*/ - do_posix_clock_monotonic_gettime(&uptime); - run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC - + current->group_leader->start_time.tv_nsec; - /* convert nsec -> AHZ */ - elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); -#else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 - { - /* new enlarged etime field */ - comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; - } -#endif - do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; - /* we really need to bite the bullet and change layout */ - ac.ac_uid = orig_cred->uid; - ac.ac_gid = orig_cred->gid; -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 - /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; -#endif -#if ACCT_VERSION==3 - ac.ac_pid = task_tgid_nr_ns(current, ns); - rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); - rcu_read_unlock(); -#endif - - spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(¤t->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); - - /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); -out: - revert_creds(orig_cred); -} - -/** - * acct_collect - collect accounting information into pacct_struct - * @exitcode: task exit code - * @group_dead: not 0, if this thread is the last one in the process. 
- */ -void acct_collect(long exitcode, int group_dead) -{ - struct pacct_struct *pacct = ¤t->signal->pacct; - unsigned long vsize = 0; - - if (group_dead && current->mm) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(¤t->mm->mmap_sem); - } - - spin_lock_irq(¤t->sighand->siglock); - if (group_dead) - pacct->ac_mem = vsize / 1024; - if (thread_group_leader(current)) { - pacct->ac_exitcode = exitcode; - if (current->flags & PF_FORKNOEXEC) - pacct->ac_flag |= AFORK; - } - if (current->flags & PF_SUPERPRIV) - pacct->ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - pacct->ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - pacct->ac_flag |= AXSIG; - pacct->ac_utime += current->utime; - pacct->ac_stime += current->stime; - pacct->ac_minflt += current->min_flt; - pacct->ac_majflt += current->maj_flt; - spin_unlock_irq(¤t->sighand->siglock); -} - -static void acct_process_in_ns(struct pid_namespace *ns) -{ - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - return; - } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); -} - -/** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. - * - * handles process accounting for an exiting task - */ -void acct_process(void) -{ - struct pid_namespace *ns; - - /* - * This loop is safe lockless, since current is still - * alive and holds its namespace, which in turn holds - * its parent. 
- */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); -} -/* - * async.c: Asynchronous function calls for boot performance - * - * (C) Copyright 2009 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - - -/* - -Goals and Theory of Operation - -The primary goal of this feature is to reduce the kernel boot time, -by doing various independent hardware delays and discovery operations -decoupled and not strictly serialized. - -More specifically, the asynchronous function call concept allows -certain operations (primarily during system boot) to happen -asynchronously, out of order, while these operations still -have their externally visible parts happen sequentially and in-order. -(not unlike how out-of-order CPUs retire their instructions in order) - -Key to the asynchronous function call implementation is the concept of -a "sequence cookie" (which, although it has an abstracted type, can be -thought of as a monotonically incrementing number). - -The async core will assign each scheduled event such a sequence cookie and -pass this to the called functions. - -The asynchronously called function should before doing a globally visible -operation, such as registering device numbers, call the -async_synchronize_cookie() function and pass in its own cookie. The -async_synchronize_cookie() function will make sure that all asynchronous -operations that were scheduled prior to the operation corresponding with the -cookie have completed. 
- -Subsystem/driver initialization code that scheduled asynchronous probe -functions, but which shares global resources with other drivers/subsystems -that do not use the asynchronous call feature, need to do a full -synchronization with the async_synchronize_full() function, before returning -from their init function. This is to maintain strict ordering between the -asynchronous and synchronous parts of the kernel. - -*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -static async_cookie_t next_cookie = 1; - -#define MAX_WORK 32768 - -static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); -static DEFINE_SPINLOCK(async_lock); - -struct async_entry { - struct list_head list; - struct work_struct work; - async_cookie_t cookie; - async_func_ptr *func; - void *data; - struct list_head *running; -}; - -static DECLARE_WAIT_QUEUE_HEAD(async_done); - -static atomic_t entry_count; - - -/* - * MUST be called with the lock held! - */ -static async_cookie_t __lowest_in_progress(struct list_head *running) -{ - struct async_entry *entry; - - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); - return entry->cookie; - } - - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) - return entry->cookie; - - return next_cookie; /* "infinity" value */ -} - -static async_cookie_t lowest_in_progress(struct list_head *running) -{ - unsigned long flags; - async_cookie_t ret; - - spin_lock_irqsave(&async_lock, flags); - ret = __lowest_in_progress(running); - spin_unlock_irqrestore(&async_lock, flags); - return ret; -} - -/* - * pick the first pending entry and run it - */ -static void async_run_entry_fn(struct work_struct *work) -{ - struct async_entry *entry = - container_of(work, struct async_entry, work); - unsigned long flags; - ktime_t uninitialized_var(calltime), delta, rettime; - - /* 1) move self to the running queue */ - spin_lock_irqsave(&async_lock, flags); - 
list_move_tail(&entry->list, entry->running); - spin_unlock_irqrestore(&async_lock, flags); - - /* 2) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "calling %lli_%pF @ %i\n", - (long long)entry->cookie, - entry->func, task_pid_nr(current)); - calltime = ktime_get(); - } - entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { - rettime = ktime_get(); - delta = ktime_sub(rettime, calltime); - printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", - (long long)entry->cookie, - entry->func, - (long long)ktime_to_ns(delta) >> 10); - } - - /* 3) remove self from the running queue */ - spin_lock_irqsave(&async_lock, flags); - list_del(&entry->list); - - /* 4) free the entry */ - kfree(entry); - atomic_dec(&entry_count); - - spin_unlock_irqrestore(&async_lock, flags); - - /* 5) wake up any waiters */ - wake_up(&async_done); -} - -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) -{ - struct async_entry *entry; - unsigned long flags; - async_cookie_t newcookie; - - /* allow irq-off callers */ - entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); - - /* - * If we're out of memory or if there's too much work - * pending already, we execute synchronously. - */ - if (!entry || atomic_read(&entry_count) > MAX_WORK) { - kfree(entry); - spin_lock_irqsave(&async_lock, flags); - newcookie = next_cookie++; - spin_unlock_irqrestore(&async_lock, flags); - - /* low on memory.. 
run synchronously */ - ptr(data, newcookie); - return newcookie; - } - INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = ptr; - entry->data = data; - entry->running = running; - - spin_lock_irqsave(&async_lock, flags); - newcookie = entry->cookie = next_cookie++; - list_add_tail(&entry->list, &async_pending); - atomic_inc(&entry_count); - spin_unlock_irqrestore(&async_lock, flags); - - /* schedule for execution */ - queue_work(system_unbound_wq, &entry->work); - - return newcookie; -} - -/** - * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously - * @data: data pointer to pass to the function - * - * Returns an async_cookie_t that may be used for checkpointing later. - * Note: This function may be called from atomic or non-atomic contexts. - */ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) -{ - return __async_schedule(ptr, data, &async_running); -} -EXPORT_SYMBOL_GPL(async_schedule); - -/** - * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously - * @data: data pointer to pass to the function - * @running: running list for the domain - * - * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_domain() functions - * to wait within a certain synchronization domain rather than globally. - * A synchronization domain is specified via the running queue @running to use. - * Note: This function may be called from atomic or non-atomic contexts. - */ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) -{ - return __async_schedule(ptr, data, running); -} -EXPORT_SYMBOL_GPL(async_schedule_domain); - -/** - * async_synchronize_full - synchronize all asynchronous function calls - * - * This function waits until all asynchronous function calls have been done. 
- */ -void async_synchronize_full(void) -{ - do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); -} -EXPORT_SYMBOL_GPL(async_synchronize_full); - -/** - * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on - * - * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. - */ -void async_synchronize_full_domain(struct list_head *list) -{ - async_synchronize_cookie_domain(next_cookie, list); -} -EXPORT_SYMBOL_GPL(async_synchronize_full_domain); - -/** - * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing - * @cookie: async_cookie_t to use as checkpoint - * @running: running list to synchronize on - * - * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted - * prior to @cookie have been done. 
- */ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) -{ - ktime_t uninitialized_var(starttime), delta, endtime; - - if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); - starttime = ktime_get(); - } - - wait_event(async_done, lowest_in_progress(running) >= cookie); - - if (initcall_debug && system_state == SYSTEM_BOOTING) { - endtime = ktime_get(); - delta = ktime_sub(endtime, starttime); - - printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", - task_pid_nr(current), - (long long)ktime_to_ns(delta) >> 10); - } -} -EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); - -/** - * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing - * @cookie: async_cookie_t to use as checkpoint - * - * This function waits until all asynchronous function calls prior to @cookie - * have been done. - */ -void async_synchronize_cookie(async_cookie_t cookie) -{ - async_synchronize_cookie_domain(cookie, &async_running); -} -EXPORT_SYMBOL_GPL(async_synchronize_cookie); -/* audit.c -- Auditing support - * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. - * System-call specific features have moved to auditsc.c - * - * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Written by Rickard E. (Rik) Faith - * - * Goals: 1) Integrate fully with Security Modules. - * 2) Minimal run-time overhead: - * a) Minimal when syscall auditing is disabled (audit_enable=0). - * b) Small when syscall auditing is enabled and no audit record - * is generated (defer as much work as possible to record - * generation time): - * i) context is allocated, - * ii) names from getname are stored without a copy, and - * iii) inode information stored from path_lookup. - * 3) Ability to disable syscall auditing at boot time (audit=0). - * 4) Usable by other parts of the kernel (if audit_log* is called, - * then a syscall record will be generated automatically for the - * current syscall). - * 5) Netlink interface to user-space. - * 6) Support low-overhead kernel-based filtering to minimize the - * information that must be passed to user-space. - * - * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#ifdef CONFIG_SECURITY -#include -#endif -#include -#include -#include - -#include "audit.h" - -/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. - * (Initialization happens after skb_init is called.) */ -#define AUDIT_DISABLED -1 -#define AUDIT_UNINITIALIZED 0 -#define AUDIT_INITIALIZED 1 -static int audit_initialized; - -#define AUDIT_OFF 0 -#define AUDIT_ON 1 -#define AUDIT_LOCKED 2 -int audit_enabled; -int audit_ever_enabled; - -EXPORT_SYMBOL_GPL(audit_enabled); - -/* Default state when kernel boots without any parameters. */ -static int audit_default; - -/* If auditing cannot proceed, audit_failure selects what happens. 
*/ -static int audit_failure = AUDIT_FAIL_PRINTK; - -/* - * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_pid contains - * the pid to use to send netlink messages to that process. - */ -int audit_pid; -static int audit_nlk_pid; - -/* If audit_rate_limit is non-zero, limit the rate of sending audit records - * to that number per second. This prevents DoS attacks, but results in - * audit records being dropped. */ -static int audit_rate_limit; - -/* Number of outstanding audit_buffers allowed. */ -static int audit_backlog_limit = 64; -static int audit_backlog_wait_time = 60 * HZ; -static int audit_backlog_wait_overflow = 0; - -/* The identity of the user shutting down the audit system. */ -uid_t audit_sig_uid = -1; -pid_t audit_sig_pid = -1; -u32 audit_sig_sid = 0; - -/* Records can be lost in several ways: - 0) [suppressed in audit_alloc] - 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] - 2) out of memory in audit_log_move [alloc_skb] - 3) suppressed due to audit_rate_limit - 4) suppressed due to audit_backlog_limit -*/ -static atomic_t audit_lost = ATOMIC_INIT(0); - -/* The netlink socket. */ -static struct sock *audit_sock; - -/* Hash for inode-based rules */ -struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; - -/* The audit_freelist is a list of pre-allocated audit buffers (if more - * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of - * being placed on the freelist). */ -static DEFINE_SPINLOCK(audit_freelist_lock); -static int audit_freelist_count; -static LIST_HEAD(audit_freelist); - -static struct sk_buff_head audit_skb_queue; -/* queue of skbs to send to auditd when/if it comes back */ -static struct sk_buff_head audit_skb_hold_queue; -static struct task_struct *kauditd_task; -static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); -static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); - -/* Serialize requests from userspace. 
*/ -DEFINE_MUTEX(audit_cmd_mutex); - -/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting - * audit records. Since printk uses a 1024 byte buffer, this buffer - * should be at least that large. */ -#define AUDIT_BUFSIZ 1024 - -/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the - * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ -#define AUDIT_MAXFREE (2*NR_CPUS) - -/* The audit_buffer is used when formatting an audit record. The caller - * locks briefly to get the record off the freelist or to allocate the - * buffer, and locks briefly to send the buffer to the netlink layer or - * to place it on a transmit queue. Multiple audit_buffers can be in - * use simultaneously. */ -struct audit_buffer { - struct list_head list; - struct sk_buff *skb; /* formatted skb ready to send */ - struct audit_context *ctx; /* NULL or associated context */ - gfp_t gfp_mask; -}; - -struct audit_reply { - int pid; - struct sk_buff *skb; -}; - -static void audit_set_pid(struct audit_buffer *ab, pid_t pid) -{ - if (ab) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_pid = pid; - } -} - -void audit_panic(const char *message) -{ - switch (audit_failure) - { - case AUDIT_FAIL_SILENT: - break; - case AUDIT_FAIL_PRINTK: - if (printk_ratelimit()) - printk(KERN_ERR "audit: %s\n", message); - break; - case AUDIT_FAIL_PANIC: - /* test audit_pid since printk is always losey, why bother? 
*/ - if (audit_pid) - panic("audit: %s\n", message); - break; - } -} - -static inline int audit_rate_check(void) -{ - static unsigned long last_check = 0; - static int messages = 0; - static DEFINE_SPINLOCK(lock); - unsigned long flags; - unsigned long now; - unsigned long elapsed; - int retval = 0; - - if (!audit_rate_limit) return 1; - - spin_lock_irqsave(&lock, flags); - if (++messages < audit_rate_limit) { - retval = 1; - } else { - now = jiffies; - elapsed = now - last_check; - if (elapsed > HZ) { - last_check = now; - messages = 0; - retval = 1; - } - } - spin_unlock_irqrestore(&lock, flags); - - return retval; -} - -/** - * audit_log_lost - conditionally log lost audit message event - * @message: the message stating reason for lost audit message - * - * Emit at least 1 message per second, even if audit_rate_check is - * throttling. - * Always increment the lost messages counter. -*/ -void audit_log_lost(const char *message) -{ - static unsigned long last_msg = 0; - static DEFINE_SPINLOCK(lock); - unsigned long flags; - unsigned long now; - int print; - - atomic_inc(&audit_lost); - - print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); - - if (!print) { - spin_lock_irqsave(&lock, flags); - now = jiffies; - if (now - last_msg > HZ) { - print = 1; - last_msg = now; - } - spin_unlock_irqrestore(&lock, flags); - } - - if (print) { - if (printk_ratelimit()) - printk(KERN_WARNING - "audit: audit_lost=%d audit_rate_limit=%d " - "audit_backlog_limit=%d\n", - atomic_read(&audit_lost), - audit_rate_limit, - audit_backlog_limit); - audit_panic(message); - } -} - -static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sessionid, u32 sid, - int allow_changes) -{ - struct audit_buffer *ab; - int rc = 0; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, loginuid, sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - - rc = 
security_secid_to_secctx(sid, &ctx, &len); - if (rc) { - audit_log_format(ab, " sid=%u", sid); - allow_changes = 0; /* Something weird, deny request */ - } else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " res=%d", allow_changes); - audit_log_end(ab); - return rc; -} - -static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sessionid, - u32 sid) -{ - int allow_changes, rc = 0, old = *to_change; - - /* check if we are locked */ - if (audit_enabled == AUDIT_LOCKED) - allow_changes = 0; - else - allow_changes = 1; - - if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, loginuid, - sessionid, sid, allow_changes); - if (rc) - allow_changes = 0; - } - - /* If we are allowed, make the change */ - if (allow_changes == 1) - *to_change = new; - /* Not allowed, update reason */ - else if (rc == 0) - rc = -EPERM; - return rc; -} - -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, - u32 sid) -{ - return audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sessionid, sid); -} - -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, - u32 sid) -{ - return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sessionid, sid); -} - -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) -{ - int rc; - if (state < AUDIT_OFF || state > AUDIT_LOCKED) - return -EINVAL; - - rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sessionid, sid); - - if (!rc) - audit_ever_enabled |= !!state; - - return rc; -} - -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) -{ - if (state != AUDIT_FAIL_SILENT - && state != AUDIT_FAIL_PRINTK - && state != AUDIT_FAIL_PANIC) - return -EINVAL; - - return audit_do_config_change("audit_failure", &audit_failure, state, - 
loginuid, sessionid, sid); -} - -/* - * Queue skbs to be sent to auditd when/if it comes back. These skbs should - * already have been sent via prink/syslog and so if these messages are dropped - * it is not a huge concern since we already passed the audit_log_lost() - * notification and stuff. This is just nice to get audit messages during - * boot before auditd is running or messages generated while auditd is stopped. - * This only holds messages is audit_default is set, aka booting with audit=1 - * or building your kernel that way. - */ -static void audit_hold_skb(struct sk_buff *skb) -{ - if (audit_default && - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) - skb_queue_tail(&audit_skb_hold_queue, skb); - else - kfree_skb(skb); -} - -/* - * For one reason or another this nlh isn't getting delivered to the userspace - * audit daemon, just send it to printk. - */ -static void audit_printk_skb(struct sk_buff *skb) -{ - struct nlmsghdr *nlh = nlmsg_hdr(skb); - char *data = NLMSG_DATA(nlh); - - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) - printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data); - else - audit_log_lost("printk limit exceeded\n"); - } - - audit_hold_skb(skb); -} - -static void kauditd_send_skb(struct sk_buff *skb) -{ - int err; - /* take a reference in case we can't send it and we want to hold it */ - skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); - if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ - printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared\n"); - audit_pid = 0; - /* we might get lucky and get this in the next auditd */ - audit_hold_skb(skb); - } else - /* drop the extra reference if sent ok */ - consume_skb(skb); -} - -static int kauditd_thread(void *dummy) -{ - struct sk_buff *skb; - - set_freezable(); - while (!kthread_should_stop()) { - /* - * if auditd just started drain the queue of messages already - * 
sent to syslog/printk. remember loss here is ok. we already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. - * - * if you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. - */ - if (audit_default && audit_pid) { - skb = skb_dequeue(&audit_skb_hold_queue); - if (unlikely(skb)) { - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } - } - } - - skb = skb_dequeue(&audit_skb_queue); - wake_up(&audit_backlog_wait); - if (skb) { - if (audit_pid) - kauditd_send_skb(skb); - else - audit_printk_skb(skb); - } else { - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); - } - } - return 0; -} - -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) -{ - struct task_struct *tsk; - int err; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(tsk); - rcu_read_unlock(); - err = tty_audit_push_task(tsk, loginuid, sessionid); - put_task_struct(tsk); - return err; -} - -int audit_send_list(void *_dest) -{ - struct audit_netlink_list *dest = _dest; - int pid = dest->pid; - struct sk_buff *skb; - - /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); - - while ((skb = __skb_dequeue(&dest->q)) != NULL) - netlink_unicast(audit_sock, skb, pid, 0); - - kfree(dest); - - return 0; -} - -struct sk_buff 
*audit_make_reply(int pid, int seq, int type, int done, - int multi, const void *payload, int size) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *data; - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? NLMSG_DONE : type; - - skb = nlmsg_new(size, GFP_KERNEL); - if (!skb) - return NULL; - - nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); - data = NLMSG_DATA(nlh); - memcpy(data, payload, size); - return skb; - -nlmsg_failure: /* Used by NLMSG_NEW */ - if (skb) - kfree_skb(skb); - return NULL; -} - -static int audit_send_reply_thread(void *arg) -{ - struct audit_reply *reply = (struct audit_reply *)arg; - - mutex_lock(&audit_cmd_mutex); - mutex_unlock(&audit_cmd_mutex); - - /* Ignore failure. It'll only happen if the sender goes away, - because our timeout is set to infinite. */ - netlink_unicast(audit_sock, reply->skb, reply->pid, 0); - kfree(reply); - return 0; -} -/** - * audit_send_reply - send an audit reply message via netlink - * @pid: process id to send reply to - * @seq: sequence number - * @type: audit message type - * @done: done (last) flag - * @multi: multi-part message flag - * @payload: payload data - * @size: payload size - * - * Allocates an skb, builds the netlink message, and sends it to the pid. - * No failure notifications. - */ -static void audit_send_reply(int pid, int seq, int type, int done, int multi, - const void *payload, int size) -{ - struct sk_buff *skb; - struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); - - if (!reply) - return; - - skb = audit_make_reply(pid, seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->pid = pid; - reply->skb = skb; - - tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); -} - -/* - * Check for appropriate CAP_AUDIT_ capabilities on incoming audit - * control messages. 
- */ -static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) -{ - int err = 0; - - switch (msg_type) { - case AUDIT_GET: - case AUDIT_LIST: - case AUDIT_LIST_RULES: - case AUDIT_SET: - case AUDIT_ADD: - case AUDIT_ADD_RULE: - case AUDIT_DEL: - case AUDIT_DEL_RULE: - case AUDIT_SIGNAL_INFO: - case AUDIT_TTY_GET: - case AUDIT_TTY_SET: - case AUDIT_TRIM: - case AUDIT_MAKE_EQUIV: - if (!capable(CAP_AUDIT_CONTROL)) - err = -EPERM; - break; - case AUDIT_USER: - case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (!capable(CAP_AUDIT_WRITE)) - err = -EPERM; - break; - default: /* bad msg */ - err = -EINVAL; - } - - return err; -} - -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 ses, - u32 sid) -{ - int rc = 0; - char *ctx = NULL; - u32 len; - - if (!audit_enabled) { - *ab = NULL; - return rc; - } - - *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - pid, uid, auid, ses); - if (sid) { - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) - audit_log_format(*ab, " ssid=%u", sid); - else { - audit_log_format(*ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - return rc; -} - -static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - u32 uid, pid, seq, sid; - void *data; - struct audit_status *status_get, status_set; - int err; - struct audit_buffer *ab; - u16 msg_type = nlh->nlmsg_type; - uid_t loginuid; /* loginuid of sender */ - u32 sessionid; - struct audit_sig_info *sig_data; - char *ctx = NULL; - u32 len; - - err = audit_netlink_ok(skb, msg_type); - if (err) - return err; - - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return 
err; - } - - pid = NETLINK_CREDS(skb)->pid; - uid = NETLINK_CREDS(skb)->uid; - loginuid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &sid); - seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); - - switch (msg_type) { - case AUDIT_GET: - status_set.enabled = audit_enabled; - status_set.failure = audit_failure; - status_set.pid = audit_pid; - status_set.rate_limit = audit_rate_limit; - status_set.backlog_limit = audit_backlog_limit; - status_set.lost = atomic_read(&audit_lost); - status_set.backlog = skb_queue_len(&audit_skb_queue); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, - &status_set, sizeof(status_set)); - break; - case AUDIT_SET: - if (nlh->nlmsg_len < sizeof(struct audit_status)) - return -EINVAL; - status_get = (struct audit_status *)data; - if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, - loginuid, sessionid, sid); - if (err < 0) - return err; - } - if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, - loginuid, sessionid, sid); - if (err < 0) - return err; - } - if (status_get->mask & AUDIT_STATUS_PID) { - int new_pid = status_get->pid; - - if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, - audit_pid, loginuid, - sessionid, sid, 1); - - audit_pid = new_pid; - audit_nlk_pid = NETLINK_CB(skb).pid; - } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { - err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sessionid, sid); - if (err < 0) - return err; - } - if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sessionid, sid); - break; - case AUDIT_USER: - case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2 ... 
AUDIT_LAST_USER_MSG2: - if (!audit_enabled && msg_type != AUDIT_USER_AVC) - return 0; - - err = audit_filter_user(&NETLINK_CB(skb)); - if (err == 1) { - err = 0; - if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid, - sessionid); - if (err) - break; - } - audit_log_common_recv_msg(&ab, msg_type, pid, uid, - loginuid, sessionid, sid); - - if (msg_type != AUDIT_USER_TTY) - audit_log_format(ab, " msg='%.1024s'", - (char *)data); - else { - int size; - - audit_log_format(ab, " msg="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); - } - audit_set_pid(ab, pid); - audit_log_end(ab); - } - break; - case AUDIT_ADD: - case AUDIT_DEL: - if (nlmsg_len(nlh) < sizeof(struct audit_rule)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); - break; - case AUDIT_ADD_RULE: - case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, - uid, seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); - break; - case AUDIT_TRIM: - audit_trim_trees(); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " op=trim res=1"); - audit_log_end(ab); - break; - case 
AUDIT_MAKE_EQUIV: { - void *bufp = data; - u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); - char *old, *new; - - err = -EINVAL; - if (msglen < 2 * sizeof(u32)) - break; - memcpy(sizes, bufp, 2 * sizeof(u32)); - bufp += 2 * sizeof(u32); - msglen -= 2 * sizeof(u32); - old = audit_unpack_string(&bufp, &msglen, sizes[0]); - if (IS_ERR(old)) { - err = PTR_ERR(old); - break; - } - new = audit_unpack_string(&bufp, &msglen, sizes[1]); - if (IS_ERR(new)) { - err = PTR_ERR(new); - kfree(old); - break; - } - /* OK, here comes... */ - err = audit_tag_tree(old, new); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sessionid, sid); - - audit_log_format(ab, " op=make_equiv old="); - audit_log_untrustedstring(ab, old); - audit_log_format(ab, " new="); - audit_log_untrustedstring(ab, new); - audit_log_format(ab, " res=%d", !err); - audit_log_end(ab); - kfree(old); - kfree(new); - break; - } - case AUDIT_SIGNAL_INFO: - len = 0; - if (audit_sig_sid) { - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; - } - sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); - if (!sig_data) { - if (audit_sig_sid) - security_release_secctx(ctx, len); - return -ENOMEM; - } - sig_data->uid = audit_sig_uid; - sig_data->pid = audit_sig_pid; - if (audit_sig_sid) { - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); - } - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); - kfree(sig_data); - break; - case AUDIT_TTY_GET: { - struct audit_tty_status s; - struct task_struct *tsk; - unsigned long flags; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - s.enabled = tsk->signal->audit_tty != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); - - if (!err) - audit_send_reply(NETLINK_CB(skb).pid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); - break; - } - case AUDIT_TTY_SET: { - struct 
audit_tty_status *s; - struct task_struct *tsk; - unsigned long flags; - - if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) - return -EINVAL; - s = data; - if (s->enabled != 0 && s->enabled != 1) - return -EINVAL; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk && lock_task_sighand(tsk, &flags)) { - tsk->signal->audit_tty = s->enabled != 0; - unlock_task_sighand(tsk, &flags); - } else - err = -ESRCH; - rcu_read_unlock(); - break; - } - default: - err = -EINVAL; - break; - } - - return err < 0 ? err : 0; -} - -/* - * Get message from skb. Each message is processed by audit_receive_msg. - * Malformed skbs with wrong length are discarded silently. - */ -static void audit_receive_skb(struct sk_buff *skb) -{ - struct nlmsghdr *nlh; - /* - * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 - * if the nlmsg_len was not aligned - */ - int len; - int err; - - nlh = nlmsg_hdr(skb); - len = skb->len; - - while (NLMSG_OK(nlh, len)) { - err = audit_receive_msg(skb, nlh); - /* if err or if this message says it wants a response */ - if (err || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); - - nlh = NLMSG_NEXT(nlh, len); - } -} - -/* Receive messages from netlink socket. */ -static void audit_receive(struct sk_buff *skb) -{ - mutex_lock(&audit_cmd_mutex); - audit_receive_skb(skb); - mutex_unlock(&audit_cmd_mutex); -} - -/* Initialize audit support at boot time. */ -static int __init audit_init(void) -{ - int i; - - if (audit_initialized == AUDIT_DISABLED) - return 0; - - printk(KERN_INFO "audit: initializing netlink socket (%s)\n", - audit_default ? 
"enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); - if (!audit_sock) - audit_panic("cannot initialize netlink socket"); - else - audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - - skb_queue_head_init(&audit_skb_queue); - skb_queue_head_init(&audit_skb_hold_queue); - audit_initialized = AUDIT_INITIALIZED; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; - - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); - - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) - INIT_LIST_HEAD(&audit_inode_hash[i]); - - return 0; -} -__initcall(audit_init); - -/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ -static int __init audit_enable(char *str) -{ - audit_default = !!simple_strtol(str, NULL, 0); - if (!audit_default) - audit_initialized = AUDIT_DISABLED; - - printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); - - if (audit_initialized == AUDIT_INITIALIZED) { - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; - } else if (audit_initialized == AUDIT_UNINITIALIZED) { - printk(" (after initialization)"); - } else { - printk(" (until reboot)"); - } - printk("\n"); - - return 1; -} - -__setup("audit=", audit_enable); - -static void audit_buffer_free(struct audit_buffer *ab) -{ - unsigned long flags; - - if (!ab) - return; - - if (ab->skb) - kfree_skb(ab->skb); - - spin_lock_irqsave(&audit_freelist_lock, flags); - if (audit_freelist_count > AUDIT_MAXFREE) - kfree(ab); - else { - audit_freelist_count++; - list_add(&ab->list, &audit_freelist); - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); -} - -static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - gfp_t gfp_mask, int type) -{ - unsigned long flags; - struct audit_buffer *ab = NULL; - struct nlmsghdr *nlh; - - spin_lock_irqsave(&audit_freelist_lock, flags); - if (!list_empty(&audit_freelist)) { - ab = 
list_entry(audit_freelist.next, - struct audit_buffer, list); - list_del(&ab->list); - --audit_freelist_count; - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); - - if (!ab) { - ab = kmalloc(sizeof(*ab), gfp_mask); - if (!ab) - goto err; - } - - ab->ctx = ctx; - ab->gfp_mask = gfp_mask; - - ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); - if (!ab->skb) - goto nlmsg_failure; - - nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); - - return ab; - -nlmsg_failure: /* Used by NLMSG_NEW */ - kfree_skb(ab->skb); - ab->skb = NULL; -err: - audit_buffer_free(ab); - return NULL; -} - -/** - * audit_serial - compute a serial number for the audit record - * - * Compute a serial number for the audit record. Audit records are - * written to user-space as soon as they are generated, so a complete - * audit record may be written in several pieces. The timestamp of the - * record and this serial number are used by the user-space tools to - * determine which pieces belong to the same audit record. The - * (timestamp,serial) tuple is unique for each syscall and is live from - * syscall entry to syscall exit. - * - * NOTE: Another possibility is to store the formatted records off the - * audit context (for those records that have a context), and emit them - * all at syscall exit. However, this could delay the reporting of - * significant errors until syscall exit (or never, if the system - * halts). - */ -unsigned int audit_serial(void) -{ - static DEFINE_SPINLOCK(serial_lock); - static unsigned int serial = 0; - - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(&serial_lock, flags); - do { - ret = ++serial; - } while (unlikely(!ret)); - spin_unlock_irqrestore(&serial_lock, flags); - - return ret; -} - -static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) -{ - if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = CURRENT_TIME; - *serial = audit_serial(); - } -} - -/* Obtain an audit buffer. 
This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the tsk is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. If there is no associated task, tsk - * should be NULL. */ - -/** - * audit_log_start - obtain an audit buffer - * @ctx: audit_context (may be NULL) - * @gfp_mask: type of allocation - * @type: audit message type - * - * Returns audit_buffer pointer on success or NULL on error. - * - * Obtain an audit buffer. This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the task (ctx) is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. If there is no associated task, then - * task context (ctx) should be NULL. - */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, - int type) -{ - struct audit_buffer *ab = NULL; - struct timespec t; - unsigned int uninitialized_var(serial); - int reserve; - unsigned long timeout_start = jiffies; - - if (audit_initialized != AUDIT_INITIALIZED) - return NULL; - - if (unlikely(audit_filter_type(type))) - return NULL; - - if (gfp_mask & __GFP_WAIT) - reserve = 0; - else - reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ - - while (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time - && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { - - /* Wait for auditd to drain the queue a little */ - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&audit_backlog_wait, &wait); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) - 
schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); - continue; - } - if (audit_rate_check() && printk_ratelimit()) - printk(KERN_WARNING - "audit: audit_backlog=%d > " - "audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), - audit_backlog_limit); - audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; - wake_up(&audit_backlog_wait); - return NULL; - } - - ab = audit_buffer_alloc(ctx, gfp_mask, type); - if (!ab) { - audit_log_lost("out of memory in audit_log_start"); - return NULL; - } - - audit_get_stamp(ab->ctx, &t, &serial); - - audit_log_format(ab, "audit(%lu.%03lu:%u): ", - t.tv_sec, t.tv_nsec/1000000, serial); - return ab; -} - -/** - * audit_expand - expand skb in the audit buffer - * @ab: audit_buffer - * @extra: space to add at tail of the skb - * - * Returns 0 (no space) on failed expansion, or available space if - * successful. - */ -static inline int audit_expand(struct audit_buffer *ab, int extra) -{ - struct sk_buff *skb = ab->skb; - int oldtail = skb_tailroom(skb); - int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); - int newtail = skb_tailroom(skb); - - if (ret < 0) { - audit_log_lost("out of memory in audit_expand"); - return 0; - } - - skb->truesize += newtail - oldtail; - return newtail; -} - -/* - * Format an audit message into the audit buffer. If there isn't enough - * room in the audit buffer, more room will be allocated and vsnprint - * will be called a second time. Currently, we assume that a printk - * can't format message larger than 1024 bytes, so we don't either. 
- */ -static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, - va_list args) -{ - int len, avail; - struct sk_buff *skb; - va_list args2; - - if (!ab) - return; - - BUG_ON(!ab->skb); - skb = ab->skb; - avail = skb_tailroom(skb); - if (avail == 0) { - avail = audit_expand(ab, AUDIT_BUFSIZ); - if (!avail) - goto out; - } - va_copy(args2, args); - len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args); - if (len >= avail) { - /* The printk buffer is 1024 bytes long, so if we get - * here and AUDIT_BUFSIZ is at least 1024, then we can - * log everything that printk could have logged. */ - avail = audit_expand(ab, - max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); - if (!avail) - goto out_va_end; - len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); - } - if (len > 0) - skb_put(skb, len); -out_va_end: - va_end(args2); -out: - return; -} - -/** - * audit_log_format - format a message into the audit buffer. - * @ab: audit_buffer - * @fmt: format string - * @...: optional parameters matching @fmt string - * - * All the work is done in audit_log_vformat. - */ -void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) -{ - va_list args; - - if (!ab) - return; - va_start(args, fmt); - audit_log_vformat(ab, fmt, args); - va_end(args); -} - -/** - * audit_log_hex - convert a buffer to hex and append it to the audit skb - * @ab: the audit_buffer - * @buf: buffer to convert to hex - * @len: length of @buf to be converted - * - * No return value; failure to expand is silently ignored. - * - * This function will take the passed buf and convert it into a string of - * ascii hex digits. The new string is placed onto the skb. 
- */ -void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, - size_t len) -{ - int i, avail, new_len; - unsigned char *ptr; - struct sk_buff *skb; - static const unsigned char *hex = "0123456789ABCDEF"; - - if (!ab) - return; - - BUG_ON(!ab->skb); - skb = ab->skb; - avail = skb_tailroom(skb); - new_len = len<<1; - if (new_len >= avail) { - /* Round the buffer request up to the next multiple */ - new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); - avail = audit_expand(ab, new_len); - if (!avail) - return; - } - - ptr = skb_tail_pointer(skb); - for (i=0; i>4]; /* Upper nibble */ - *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ - } - *ptr = 0; - skb_put(skb, len << 1); /* new string is twice the old string */ -} - -/* - * Format a string of no more than slen characters into the audit buffer, - * enclosed in quote marks. - */ -void audit_log_n_string(struct audit_buffer *ab, const char *string, - size_t slen) -{ - int avail, new_len; - unsigned char *ptr; - struct sk_buff *skb; - - if (!ab) - return; - - BUG_ON(!ab->skb); - skb = ab->skb; - avail = skb_tailroom(skb); - new_len = slen + 3; /* enclosing quotes + null terminator */ - if (new_len > avail) { - avail = audit_expand(ab, new_len); - if (!avail) - return; - } - ptr = skb_tail_pointer(skb); - *ptr++ = '"'; - memcpy(ptr, string, slen); - ptr += slen; - *ptr++ = '"'; - *ptr = 0; - skb_put(skb, slen + 2); /* don't include null terminator */ -} - -/** - * audit_string_contains_control - does a string need to be logged in hex - * @string: string to be checked - * @len: max length of the string to check - */ -int audit_string_contains_control(const char *string, size_t len) -{ - const unsigned char *p; - for (p = string; p < (const unsigned char *)string + len; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7e) - return 1; - } - return 0; -} - -/** - * audit_log_n_untrustedstring - log a string that may contain random characters - * @ab: audit_buffer - * @len: length of string (not 
including trailing null) - * @string: string to be logged - * - * This code will escape a string that is passed to it if the string - * contains a control character, unprintable character, double quote mark, - * or a space. Unescaped strings will start and end with a double quote mark. - * Strings that are escaped are printed in hex (2 digits per char). - * - * The caller specifies the number of characters in the string to log, which may - * or may not be the entire string. - */ -void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, - size_t len) -{ - if (audit_string_contains_control(string, len)) - audit_log_n_hex(ab, string, len); - else - audit_log_n_string(ab, string, len); -} - -/** - * audit_log_untrustedstring - log a string that may contain random characters - * @ab: audit_buffer - * @string: string to be logged - * - * Same as audit_log_n_untrustedstring(), except that strlen is used to - * determine string length. - */ -void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) -{ - audit_log_n_untrustedstring(ab, string, strlen(string)); -} - -/* This is a helper-function to print the escaped d_path */ -void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) -{ - char *p, *pathname; - - if (prefix) - audit_log_format(ab, "%s", prefix); - - /* We will allow 11 spaces for ' (deleted)' to be appended */ - pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); - if (!pathname) { - audit_log_string(ab, ""); - return; - } - p = d_path(path, pathname, PATH_MAX+11); - if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ - /* FIXME: can we save some information here? 
*/ - audit_log_string(ab, ""); - } else - audit_log_untrustedstring(ab, p); - kfree(pathname); -} - -void audit_log_key(struct audit_buffer *ab, char *key) -{ - audit_log_format(ab, " key="); - if (key) - audit_log_untrustedstring(ab, key); - else - audit_log_format(ab, "(null)"); -} - -/** - * audit_log_end - end one audit record - * @ab: the audit_buffer - * - * The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is placed on a queue and a tasklet is scheduled to - * remove them from the queue outside the irq context. May be called in - * any context. - */ -void audit_log_end(struct audit_buffer *ab) -{ - if (!ab) - return; - if (!audit_rate_check()) { - audit_log_lost("rate limit exceeded"); - } else { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); - - if (audit_pid) { - skb_queue_tail(&audit_skb_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); - } else { - audit_printk_skb(ab->skb); - } - ab->skb = NULL; - } - audit_buffer_free(ab); -} - -/** - * audit_log - Log an audit record - * @ctx: audit context - * @gfp_mask: type of allocation - * @type: audit message type - * @fmt: format string to use - * @...: variable parameters matching the format string - * - * This is a convenience function that calls audit_log_start, - * audit_log_vformat, and audit_log_end. It may be called - * in any context. - */ -void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, - const char *fmt, ...) 
-{ - struct audit_buffer *ab; - va_list args; - - ab = audit_log_start(ctx, gfp_mask, type); - if (ab) { - va_start(args, fmt); - audit_log_vformat(ab, fmt, args); - va_end(args); - audit_log_end(ab); - } -} - -#ifdef CONFIG_SECURITY -/** - * audit_log_secctx - Converts and logs SELinux context - * @ab: audit_buffer - * @secid: security number - * - * This is a helper function that calls security_secid_to_secctx to convert - * secid to secctx and then adds the (converted) SELinux context to the audit - * log by calling audit_log_format, thus also preventing leak of internal secid - * to userspace. If secid cannot be converted audit_panic is called. - */ -void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ - u32 len; - char *secctx; - - if (security_secid_to_secctx(secid, &secctx, &len)) { - audit_panic("Cannot convert secid to context"); - } else { - audit_log_format(ab, " obj=%s", secctx); - security_release_secctx(secctx, len); - } -} -EXPORT_SYMBOL(audit_log_secctx); -#endif - -EXPORT_SYMBOL(audit_log_start); -EXPORT_SYMBOL(audit_log_end); -EXPORT_SYMBOL(audit_log_format); -EXPORT_SYMBOL(audit_log); -#include "audit.h" -#include -#include -#include -#include -#include - -struct audit_tree; -struct audit_chunk; - -struct audit_tree { - atomic_t count; - int goner; - struct audit_chunk *root; - struct list_head chunks; - struct list_head rules; - struct list_head list; - struct list_head same_root; - struct rcu_head head; - char pathname[]; -}; - -struct audit_chunk { - struct list_head hash; - struct fsnotify_mark mark; - struct list_head trees; /* with root here */ - int dead; - int count; - atomic_long_t refs; - struct rcu_head head; - struct node { - struct list_head list; - struct audit_tree *owner; - unsigned index; /* index; upper bit indicates 'will prune' */ - } owners[]; -}; - -static LIST_HEAD(tree_list); -static LIST_HEAD(prune_list); - -/* - * One struct chunk is attached to each inode of interest. 
- * We replace struct chunk on tagging/untagging. - * Rules have pointer to struct audit_tree. - * Rules have struct list_head rlist forming a list of rules over - * the same tree. - * References to struct chunk are collected at audit_inode{,_child}() - * time and used in AUDIT_TREE rule matching. - * These references are dropped at the same time we are calling - * audit_free_names(), etc. - * - * Cyclic lists galore: - * tree.chunks anchors chunk.owners[].list hash_lock - * tree.rules anchors rule.rlist audit_filter_mutex - * chunk.trees anchors tree.same_root hash_lock - * chunk.hash is a hash with middle bits of watch.inode as - * a hash function. RCU, hash_lock - * - * tree is refcounted; one reference for "some rules on rules_list refer to - * it", one for each chunk with pointer to it. - * - * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount - * of watch contributes 1 to .refs). - * - * node.index allows to get from node.list to containing chunk. - * MSB of that sucker is stolen to mark taggings that we might have to - * revert - several operations have very unpleasant cleanup logics and - * that makes a difference. Some. 
- */ - -static struct fsnotify_group *audit_tree_group; - -static struct audit_tree *alloc_tree(const char *s) -{ - struct audit_tree *tree; - - tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); - if (tree) { - atomic_set(&tree->count, 1); - tree->goner = 0; - INIT_LIST_HEAD(&tree->chunks); - INIT_LIST_HEAD(&tree->rules); - INIT_LIST_HEAD(&tree->list); - INIT_LIST_HEAD(&tree->same_root); - tree->root = NULL; - strcpy(tree->pathname, s); - } - return tree; -} - -static inline void get_tree(struct audit_tree *tree) -{ - atomic_inc(&tree->count); -} - -static inline void put_tree(struct audit_tree *tree) -{ - if (atomic_dec_and_test(&tree->count)) - kfree_rcu(tree, head); -} - -/* to avoid bringing the entire thing in audit.h */ -const char *audit_tree_path(struct audit_tree *tree) -{ - return tree->pathname; -} - -static void free_chunk(struct audit_chunk *chunk) -{ - int i; - - for (i = 0; i < chunk->count; i++) { - if (chunk->owners[i].owner) - put_tree(chunk->owners[i].owner); - } - kfree(chunk); -} - -void audit_put_chunk(struct audit_chunk *chunk) -{ - if (atomic_long_dec_and_test(&chunk->refs)) - free_chunk(chunk); -} - -static void __put_chunk(struct rcu_head *rcu) -{ - struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); - audit_put_chunk(chunk); -} - -static void audit_tree_destroy_watch(struct fsnotify_mark *entry) -{ - struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); - call_rcu(&chunk->head, __put_chunk); -} - -static struct audit_chunk *alloc_chunk(int count) -{ - struct audit_chunk *chunk; - size_t size; - int i; - - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); - if (!chunk) - return NULL; - - INIT_LIST_HEAD(&chunk->hash); - INIT_LIST_HEAD(&chunk->trees); - chunk->count = count; - atomic_long_set(&chunk->refs, 1); - for (i = 0; i < count; i++) { - INIT_LIST_HEAD(&chunk->owners[i].list); - chunk->owners[i].index = i; - } - 
fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); - return chunk; -} - -enum {HASH_SIZE = 128}; -static struct list_head chunk_hash_heads[HASH_SIZE]; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); - -static inline struct list_head *chunk_hash(const struct inode *inode) -{ - unsigned long n = (unsigned long)inode / L1_CACHE_BYTES; - return chunk_hash_heads + n % HASH_SIZE; -} - -/* hash_lock & entry->lock is held by caller */ -static void insert_hash(struct audit_chunk *chunk) -{ - struct fsnotify_mark *entry = &chunk->mark; - struct list_head *list; - - if (!entry->i.inode) - return; - list = chunk_hash(entry->i.inode); - list_add_rcu(&chunk->hash, list); -} - -/* called under rcu_read_lock */ -struct audit_chunk *audit_tree_lookup(const struct inode *inode) -{ - struct list_head *list = chunk_hash(inode); - struct audit_chunk *p; - - list_for_each_entry_rcu(p, list, hash) { - /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.i.inode == inode) { - atomic_long_inc(&p->refs); - return p; - } - } - return NULL; -} - -int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) -{ - int n; - for (n = 0; n < chunk->count; n++) - if (chunk->owners[n].owner == tree) - return 1; - return 0; -} - -/* tagging and untagging inodes with trees */ - -static struct audit_chunk *find_chunk(struct node *p) -{ - int index = p->index & ~(1U<<31); - p -= index; - return container_of(p, struct audit_chunk, owners[0]); -} - -static void untag_chunk(struct node *p) -{ - struct audit_chunk *chunk = find_chunk(p); - struct fsnotify_mark *entry = &chunk->mark; - struct audit_chunk *new = NULL; - struct audit_tree *owner; - int size = chunk->count - 1; - int i, j; - - fsnotify_get_mark(entry); - - spin_unlock(&hash_lock); - - if (size) - new = alloc_chunk(size); - - spin_lock(&entry->lock); - if (chunk->dead || !entry->i.inode) { - spin_unlock(&entry->lock); - if (new) - free_chunk(new); - goto out; - } - - owner = p->owner; - - if 
(!size) { - chunk->dead = 1; - spin_lock(&hash_lock); - list_del_init(&chunk->trees); - if (owner->root == chunk) - owner->root = NULL; - list_del_init(&p->list); - list_del_rcu(&chunk->hash); - spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); - goto out; - } - - if (!new) - goto Fallback; - - fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); - goto Fallback; - } - - chunk->dead = 1; - spin_lock(&hash_lock); - list_replace_init(&chunk->trees, &new->trees); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - - for (i = j = 0; j <= size; i++, j++) { - struct audit_tree *s; - if (&chunk->owners[j] == p) { - list_del_init(&p->list); - i--; - continue; - } - s = chunk->owners[j].owner; - new->owners[i].owner = s; - new->owners[i].index = chunk->owners[j].index - j + i; - if (!s) /* result of earlier fallback */ - continue; - get_tree(s); - list_replace_init(&chunk->owners[j].list, &new->owners[i].list); - } - - list_replace_rcu(&chunk->hash, &new->hash); - list_for_each_entry(owner, &new->trees, same_root) - owner->root = new; - spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); - goto out; - -Fallback: - // do the best we can - spin_lock(&hash_lock); - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - spin_unlock(&hash_lock); - spin_unlock(&entry->lock); -out: - fsnotify_put_mark(entry); - spin_lock(&hash_lock); -} - -static int create_chunk(struct inode *inode, struct audit_tree *tree) -{ - struct fsnotify_mark *entry; - struct audit_chunk *chunk = alloc_chunk(1); - if (!chunk) - return -ENOMEM; - - entry = &chunk->mark; - if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - 
free_chunk(chunk); - return -ENOSPC; - } - - spin_lock(&entry->lock); - spin_lock(&hash_lock); - if (tree->goner) { - spin_unlock(&hash_lock); - chunk->dead = 1; - spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); - return 0; - } - chunk->owners[0].index = (1U << 31); - chunk->owners[0].owner = tree; - get_tree(tree); - list_add(&chunk->owners[0].list, &tree->chunks); - if (!tree->root) { - tree->root = chunk; - list_add(&tree->same_root, &chunk->trees); - } - insert_hash(chunk); - spin_unlock(&hash_lock); - spin_unlock(&entry->lock); - return 0; -} - -/* the first tagged inode becomes root of tree */ -static int tag_chunk(struct inode *inode, struct audit_tree *tree) -{ - struct fsnotify_mark *old_entry, *chunk_entry; - struct audit_tree *owner; - struct audit_chunk *chunk, *old; - struct node *p; - int n; - - old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); - if (!old_entry) - return create_chunk(inode, tree); - - old = container_of(old_entry, struct audit_chunk, mark); - - /* are we already there? 
*/ - spin_lock(&hash_lock); - for (n = 0; n < old->count; n++) { - if (old->owners[n].owner == tree) { - spin_unlock(&hash_lock); - fsnotify_put_mark(old_entry); - return 0; - } - } - spin_unlock(&hash_lock); - - chunk = alloc_chunk(old->count + 1); - if (!chunk) { - fsnotify_put_mark(old_entry); - return -ENOMEM; - } - - chunk_entry = &chunk->mark; - - spin_lock(&old_entry->lock); - if (!old_entry->i.inode) { - /* old_entry is being shot, lets just lie */ - spin_unlock(&old_entry->lock); - fsnotify_put_mark(old_entry); - free_chunk(chunk); - return -ENOENT; - } - - fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { - spin_unlock(&old_entry->lock); - free_chunk(chunk); - fsnotify_put_mark(old_entry); - return -ENOSPC; - } - - /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ - spin_lock(&chunk_entry->lock); - spin_lock(&hash_lock); - - /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ - if (tree->goner) { - spin_unlock(&hash_lock); - chunk->dead = 1; - spin_unlock(&chunk_entry->lock); - spin_unlock(&old_entry->lock); - - fsnotify_destroy_mark(chunk_entry); - - fsnotify_put_mark(chunk_entry); - fsnotify_put_mark(old_entry); - return 0; - } - list_replace_init(&old->trees, &chunk->trees); - for (n = 0, p = chunk->owners; n < old->count; n++, p++) { - struct audit_tree *s = old->owners[n].owner; - p->owner = s; - p->index = old->owners[n].index; - if (!s) /* result of fallback in untag */ - continue; - get_tree(s); - list_replace_init(&old->owners[n].list, &p->list); - } - p->index = (chunk->count - 1) | (1U<<31); - p->owner = tree; - get_tree(tree); - list_add(&p->list, &tree->chunks); - list_replace_rcu(&old->hash, &chunk->hash); - list_for_each_entry(owner, &chunk->trees, same_root) - owner->root = chunk; - old->dead = 1; - if (!tree->root) { - tree->root = chunk; - list_add(&tree->same_root, 
&chunk->trees); - } - spin_unlock(&hash_lock); - spin_unlock(&chunk_entry->lock); - spin_unlock(&old_entry->lock); - fsnotify_destroy_mark(old_entry); - fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ - return 0; -} - -static void kill_rules(struct audit_tree *tree) -{ - struct audit_krule *rule, *next; - struct audit_entry *entry; - struct audit_buffer *ab; - - list_for_each_entry_safe(rule, next, &tree->rules, rlist) { - entry = container_of(rule, struct audit_entry, rule); - - list_del_init(&rule->rlist); - if (rule->tree) { - /* not a half-baked one */ - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); - audit_log_format(ab, " dir="); - audit_log_untrustedstring(ab, rule->tree->pathname); - audit_log_key(ab, rule->filterkey); - audit_log_format(ab, " list=%d res=1", rule->listnr); - audit_log_end(ab); - rule->tree = NULL; - list_del_rcu(&entry->list); - list_del(&entry->rule.list); - call_rcu(&entry->rcu, audit_free_rule_rcu); - } - } -} - -/* - * finish killing struct audit_tree - */ -static void prune_one(struct audit_tree *victim) -{ - spin_lock(&hash_lock); - while (!list_empty(&victim->chunks)) { - struct node *p; - - p = list_entry(victim->chunks.next, struct node, list); - - untag_chunk(p); - } - spin_unlock(&hash_lock); - put_tree(victim); -} - -/* trim the uncommitted chunks from tree */ - -static void trim_marked(struct audit_tree *tree) -{ - struct list_head *p, *q; - spin_lock(&hash_lock); - if (tree->goner) { - spin_unlock(&hash_lock); - return; - } - /* reorder */ - for (p = tree->chunks.next; p != &tree->chunks; p = q) { - struct node *node = list_entry(p, struct node, list); - q = p->next; - if (node->index & (1U<<31)) { - list_del_init(p); - list_add(p, &tree->chunks); - } - } - - while (!list_empty(&tree->chunks)) { - struct node *node; - - node = list_entry(tree->chunks.next, struct node, 
list); - - /* have we run out of marked? */ - if (!(node->index & (1U<<31))) - break; - - untag_chunk(node); - } - if (!tree->root && !tree->goner) { - tree->goner = 1; - spin_unlock(&hash_lock); - mutex_lock(&audit_filter_mutex); - kill_rules(tree); - list_del_init(&tree->list); - mutex_unlock(&audit_filter_mutex); - prune_one(tree); - } else { - spin_unlock(&hash_lock); - } -} - -static void audit_schedule_prune(void); - -/* called with audit_filter_mutex */ -int audit_remove_tree_rule(struct audit_krule *rule) -{ - struct audit_tree *tree; - tree = rule->tree; - if (tree) { - spin_lock(&hash_lock); - list_del_init(&rule->rlist); - if (list_empty(&tree->rules) && !tree->goner) { - tree->root = NULL; - list_del_init(&tree->same_root); - tree->goner = 1; - list_move(&tree->list, &prune_list); - rule->tree = NULL; - spin_unlock(&hash_lock); - audit_schedule_prune(); - return 1; - } - rule->tree = NULL; - spin_unlock(&hash_lock); - return 1; - } - return 0; -} - -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return mnt->mnt_root->d_inode == arg; -} - -void audit_trim_trees(void) -{ - struct list_head cursor; - - mutex_lock(&audit_filter_mutex); - list_add(&cursor, &tree_list); - while (cursor.next != &tree_list) { - struct audit_tree *tree; - struct path path; - struct vfsmount *root_mnt; - struct node *node; - int err; - - tree = container_of(cursor.next, struct audit_tree, list); - get_tree(tree); - list_del(&cursor); - list_add(&cursor, &tree->list); - mutex_unlock(&audit_filter_mutex); - - err = kern_path(tree->pathname, 0, &path); - if (err) - goto skip_it; - - root_mnt = collect_mounts(&path); - path_put(&path); - if (!root_mnt) - goto skip_it; - - spin_lock(&hash_lock); - list_for_each_entry(node, &tree->chunks, list) { - struct audit_chunk *chunk = find_chunk(node); - /* this could be NULL if the watch is dying else where... 
*/ - struct inode *inode = chunk->mark.i.inode; - node->index |= 1U<<31; - if (iterate_mounts(compare_root, inode, root_mnt)) - node->index &= ~(1U<<31); - } - spin_unlock(&hash_lock); - trim_marked(tree); - put_tree(tree); - drop_collected_mounts(root_mnt); -skip_it: - mutex_lock(&audit_filter_mutex); - } - list_del(&cursor); - mutex_unlock(&audit_filter_mutex); -} - -int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) -{ - - if (pathname[0] != '/' || - rule->listnr != AUDIT_FILTER_EXIT || - op != Audit_equal || - rule->inode_f || rule->watch || rule->tree) - return -EINVAL; - rule->tree = alloc_tree(pathname); - if (!rule->tree) - return -ENOMEM; - return 0; -} - -void audit_put_tree(struct audit_tree *tree) -{ - put_tree(tree); -} - -static int tag_mount(struct vfsmount *mnt, void *arg) -{ - return tag_chunk(mnt->mnt_root->d_inode, arg); -} - -/* called with audit_filter_mutex */ -int audit_add_tree_rule(struct audit_krule *rule) -{ - struct audit_tree *seed = rule->tree, *tree; - struct path path; - struct vfsmount *mnt; - int err; - - list_for_each_entry(tree, &tree_list, list) { - if (!strcmp(seed->pathname, tree->pathname)) { - put_tree(seed); - rule->tree = tree; - list_add(&rule->rlist, &tree->rules); - return 0; - } - } - tree = seed; - list_add(&tree->list, &tree_list); - list_add(&rule->rlist, &tree->rules); - /* do not set rule->tree yet */ - mutex_unlock(&audit_filter_mutex); - - err = kern_path(tree->pathname, 0, &path); - if (err) - goto Err; - mnt = collect_mounts(&path); - path_put(&path); - if (!mnt) { - err = -ENOMEM; - goto Err; - } - - get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); - - if (!err) { - struct node *node; - spin_lock(&hash_lock); - list_for_each_entry(node, &tree->chunks, list) - node->index &= ~(1U<<31); - spin_unlock(&hash_lock); - } else { - trim_marked(tree); - goto Err; - } - - mutex_lock(&audit_filter_mutex); - if (list_empty(&rule->rlist)) { - put_tree(tree); - 
return -ENOENT; - } - rule->tree = tree; - put_tree(tree); - - return 0; -Err: - mutex_lock(&audit_filter_mutex); - list_del_init(&tree->list); - list_del_init(&tree->rules); - put_tree(tree); - return err; -} - -int audit_tag_tree(char *old, char *new) -{ - struct list_head cursor, barrier; - int failed = 0; - struct path path1, path2; - struct vfsmount *tagged; - int err; - - err = kern_path(new, 0, &path2); - if (err) - return err; - tagged = collect_mounts(&path2); - path_put(&path2); - if (!tagged) - return -ENOMEM; - - err = kern_path(old, 0, &path1); - if (err) { - drop_collected_mounts(tagged); - return err; - } - - mutex_lock(&audit_filter_mutex); - list_add(&barrier, &tree_list); - list_add(&cursor, &barrier); - - while (cursor.next != &tree_list) { - struct audit_tree *tree; - int good_one = 0; - - tree = container_of(cursor.next, struct audit_tree, list); - get_tree(tree); - list_del(&cursor); - list_add(&cursor, &tree->list); - mutex_unlock(&audit_filter_mutex); - - err = kern_path(tree->pathname, 0, &path2); - if (!err) { - good_one = path_is_under(&path1, &path2); - path_put(&path2); - } - - if (!good_one) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - continue; - } - - failed = iterate_mounts(tag_mount, tree, tagged); - if (failed) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - break; - } - - mutex_lock(&audit_filter_mutex); - spin_lock(&hash_lock); - if (!tree->goner) { - list_del(&tree->list); - list_add(&tree->list, &tree_list); - } - spin_unlock(&hash_lock); - put_tree(tree); - } - - while (barrier.prev != &tree_list) { - struct audit_tree *tree; - - tree = container_of(barrier.prev, struct audit_tree, list); - get_tree(tree); - list_del(&tree->list); - list_add(&tree->list, &barrier); - mutex_unlock(&audit_filter_mutex); - - if (!failed) { - struct node *node; - spin_lock(&hash_lock); - list_for_each_entry(node, &tree->chunks, list) - node->index &= ~(1U<<31); - spin_unlock(&hash_lock); - } else { - trim_marked(tree); - } - 
- put_tree(tree); - mutex_lock(&audit_filter_mutex); - } - list_del(&barrier); - list_del(&cursor); - mutex_unlock(&audit_filter_mutex); - path_put(&path1); - drop_collected_mounts(tagged); - return failed; -} - -/* - * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread. - */ -static int prune_tree_thread(void *unused) -{ - mutex_lock(&audit_cmd_mutex); - mutex_lock(&audit_filter_mutex); - - while (!list_empty(&prune_list)) { - struct audit_tree *victim; - - victim = list_entry(prune_list.next, struct audit_tree, list); - list_del_init(&victim->list); - - mutex_unlock(&audit_filter_mutex); - - prune_one(victim); - - mutex_lock(&audit_filter_mutex); - } - - mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); - return 0; -} - -static void audit_schedule_prune(void) -{ - kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); -} - -/* - * ... and that one is done if evict_chunk() decides to delay until the end - * of syscall. Runs synchronously. 
- */ -void audit_kill_trees(struct list_head *list) -{ - mutex_lock(&audit_cmd_mutex); - mutex_lock(&audit_filter_mutex); - - while (!list_empty(list)) { - struct audit_tree *victim; - - victim = list_entry(list->next, struct audit_tree, list); - kill_rules(victim); - list_del_init(&victim->list); - - mutex_unlock(&audit_filter_mutex); - - prune_one(victim); - - mutex_lock(&audit_filter_mutex); - } - - mutex_unlock(&audit_filter_mutex); - mutex_unlock(&audit_cmd_mutex); -} - -/* - * Here comes the stuff asynchronous to auditctl operations - */ - -static void evict_chunk(struct audit_chunk *chunk) -{ - struct audit_tree *owner; - struct list_head *postponed = audit_killed_trees(); - int need_prune = 0; - int n; - - if (chunk->dead) - return; - - chunk->dead = 1; - mutex_lock(&audit_filter_mutex); - spin_lock(&hash_lock); - while (!list_empty(&chunk->trees)) { - owner = list_entry(chunk->trees.next, - struct audit_tree, same_root); - owner->goner = 1; - owner->root = NULL; - list_del_init(&owner->same_root); - spin_unlock(&hash_lock); - if (!postponed) { - kill_rules(owner); - list_move(&owner->list, &prune_list); - need_prune = 1; - } else { - list_move(&owner->list, postponed); - } - spin_lock(&hash_lock); - } - list_del_rcu(&chunk->hash); - for (n = 0; n < chunk->count; n++) - list_del_init(&chunk->owners[n].list); - spin_unlock(&hash_lock); - if (need_prune) - audit_schedule_prune(); - mutex_unlock(&audit_filter_mutex); -} - -static int audit_tree_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) -{ - BUG(); - return -EOPNOTSUPP; -} - -static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) -{ - struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); - - evict_chunk(chunk); - fsnotify_put_mark(entry); -} - -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct 
fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - -static const struct fsnotify_ops audit_tree_ops = { - .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, - .free_group_priv = NULL, - .free_event_priv = NULL, - .freeing_mark = audit_tree_freeing_mark, -}; - -static int __init audit_tree_init(void) -{ - int i; - - audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); - if (IS_ERR(audit_tree_group)) - audit_panic("cannot initialize fsnotify group for rectree watches"); - - for (i = 0; i < HASH_SIZE; i++) - INIT_LIST_HEAD(&chunk_hash_heads[i]); - - return 0; -} -__initcall(audit_tree_init); -/* audit_watch.c -- watching inodes - * - * Copyright 2003-2009 Red Hat, Inc. - * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "audit.h" - -/* - * Reference counting: - * - * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED - * event. Each audit_watch holds a reference to its associated parent. 
- * - * audit_watch: if added to lists, lifetime is from audit_init_watch() to - * audit_remove_watch(). Additionally, an audit_watch may exist - * temporarily to assist in searching existing filter data. Each - * audit_krule holds a reference to its associated watch. - */ - -struct audit_watch { - atomic_t count; /* reference count */ - dev_t dev; /* associated superblock device */ - char *path; /* insertion path */ - unsigned long ino; /* associated inode number */ - struct audit_parent *parent; /* associated parent */ - struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* anchor for krule->rlist */ -}; - -struct audit_parent { - struct list_head watches; /* anchor for audit_watch->wlist */ - struct fsnotify_mark mark; /* fsnotify mark on the inode */ -}; - -/* fsnotify handle. */ -static struct fsnotify_group *audit_watch_group; - -/* fsnotify events we care about. */ -#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) - -static void audit_free_parent(struct audit_parent *parent) -{ - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); -} - -static void audit_watch_free_mark(struct fsnotify_mark *entry) -{ - struct audit_parent *parent; - - parent = container_of(entry, struct audit_parent, mark); - audit_free_parent(parent); -} - -static void audit_get_parent(struct audit_parent *parent) -{ - if (likely(parent)) - fsnotify_get_mark(&parent->mark); -} - -static void audit_put_parent(struct audit_parent *parent) -{ - if (likely(parent)) - fsnotify_put_mark(&parent->mark); -} - -/* - * Find and return the audit_parent on the given inode. If found a reference - * is taken on this parent. 
- */ -static inline struct audit_parent *audit_find_parent(struct inode *inode) -{ - struct audit_parent *parent = NULL; - struct fsnotify_mark *entry; - - entry = fsnotify_find_inode_mark(audit_watch_group, inode); - if (entry) - parent = container_of(entry, struct audit_parent, mark); - - return parent; -} - -void audit_get_watch(struct audit_watch *watch) -{ - atomic_inc(&watch->count); -} - -void audit_put_watch(struct audit_watch *watch) -{ - if (atomic_dec_and_test(&watch->count)) { - WARN_ON(watch->parent); - WARN_ON(!list_empty(&watch->rules)); - kfree(watch->path); - kfree(watch); - } -} - -static void audit_remove_watch(struct audit_watch *watch) -{ - list_del(&watch->wlist); - audit_put_parent(watch->parent); - watch->parent = NULL; - audit_put_watch(watch); /* match initial get */ -} - -char *audit_watch_path(struct audit_watch *watch) -{ - return watch->path; -} - -int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) -{ - return (watch->ino != (unsigned long)-1) && - (watch->ino == ino) && - (watch->dev == dev); -} - -/* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct path *path) -{ - struct inode *inode = path->dentry->d_inode; - struct audit_parent *parent; - int ret; - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (unlikely(!parent)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&parent->watches); - - fsnotify_init_mark(&parent->mark, audit_watch_free_mark); - parent->mark.mask = AUDIT_FS_WATCH; - ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); - if (ret < 0) { - audit_free_parent(parent); - return ERR_PTR(ret); - } - - return parent; -} - -/* Initialize a watch entry. 
*/ -static struct audit_watch *audit_init_watch(char *path) -{ - struct audit_watch *watch; - - watch = kzalloc(sizeof(*watch), GFP_KERNEL); - if (unlikely(!watch)) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); - watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; - - return watch; -} - -/* Translate a watch string to kernel respresentation. */ -int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) -{ - struct audit_watch *watch; - - if (!audit_watch_group) - return -EOPNOTSUPP; - - if (path[0] != '/' || path[len-1] == '/' || - krule->listnr != AUDIT_FILTER_EXIT || - op != Audit_equal || - krule->inode_f || krule->watch || krule->tree) - return -EINVAL; - - watch = audit_init_watch(path); - if (IS_ERR(watch)) - return PTR_ERR(watch); - - audit_get_watch(watch); - krule->watch = watch; - - return 0; -} - -/* Duplicate the given audit watch. The new watch's rules list is initialized - * to an empty list and wlist is undefined. 
*/ -static struct audit_watch *audit_dupe_watch(struct audit_watch *old) -{ - char *path; - struct audit_watch *new; - - path = kstrdup(old->path, GFP_KERNEL); - if (unlikely(!path)) - return ERR_PTR(-ENOMEM); - - new = audit_init_watch(path); - if (IS_ERR(new)) { - kfree(path); - goto out; - } - - new->dev = old->dev; - new->ino = old->ino; - audit_get_parent(old->parent); - new->parent = old->parent; - -out: - return new; -} - -static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) -{ - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "auid=%u ses=%u op=", - audit_get_loginuid(current), - audit_get_sessionid(current)); - audit_log_string(ab, op); - audit_log_format(ab, " path="); - audit_log_untrustedstring(ab, w->path); - audit_log_key(ab, r->filterkey); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } -} - -/* Update inode info in audit rules based on filesystem event. */ -static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, - unsigned long ino, unsigned invalidating) -{ - struct audit_watch *owatch, *nwatch, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *oentry, *nentry; - - mutex_lock(&audit_filter_mutex); - /* Run all of the watches on this parent looking for the one that - * matches the given dname */ - list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) - continue; - - /* If the update involves invalidating rules, do the inode-based - * filtering now, so we don't omit records. 
*/ - if (invalidating && !audit_dummy_context()) - audit_filter_inodes(current, current->audit_context); - - /* updating ino will likely change which audit_hash_list we - * are on so we need a new watch for the new list */ - nwatch = audit_dupe_watch(owatch); - if (IS_ERR(nwatch)) { - mutex_unlock(&audit_filter_mutex); - audit_panic("error updating watch, skipping"); - return; - } - nwatch->dev = dev; - nwatch->ino = ino; - - list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { - - oentry = container_of(r, struct audit_entry, rule); - list_del(&oentry->rule.rlist); - list_del_rcu(&oentry->list); - - nentry = audit_dupe_rule(&oentry->rule); - if (IS_ERR(nentry)) { - list_del(&oentry->rule.list); - audit_panic("error updating watch, removing"); - } else { - int h = audit_hash_ino((u32)ino); - - /* - * nentry->rule.watch == oentry->rule.watch so - * we must drop that reference and set it to our - * new watch. - */ - audit_put_watch(nentry->rule.watch); - audit_get_watch(nwatch); - nentry->rule.watch = nwatch; - list_add(&nentry->rule.rlist, &nwatch->rules); - list_add_rcu(&nentry->list, &audit_inode_hash[h]); - list_replace(&oentry->rule.list, - &nentry->rule.list); - } - - audit_watch_log_rule_change(r, owatch, "updated rules"); - - call_rcu(&oentry->rcu, audit_free_rule_rcu); - } - - audit_remove_watch(owatch); - goto add_watch_to_parent; /* event applies to a single watch */ - } - mutex_unlock(&audit_filter_mutex); - return; - -add_watch_to_parent: - list_add(&nwatch->wlist, &parent->watches); - mutex_unlock(&audit_filter_mutex); - return; -} - -/* Remove all watches & rules associated with a parent that is going away. 
*/ -static void audit_remove_parent_watches(struct audit_parent *parent) -{ - struct audit_watch *w, *nextw; - struct audit_krule *r, *nextr; - struct audit_entry *e; - - mutex_lock(&audit_filter_mutex); - list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { - list_for_each_entry_safe(r, nextr, &w->rules, rlist) { - e = container_of(r, struct audit_entry, rule); - audit_watch_log_rule_change(r, w, "remove rule"); - list_del(&r->rlist); - list_del(&r->list); - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - } - audit_remove_watch(w); - } - mutex_unlock(&audit_filter_mutex); - - fsnotify_destroy_mark(&parent->mark); -} - -/* Get path information necessary for adding watches. */ -static int audit_get_nd(struct audit_watch *watch, struct path *parent) -{ - struct nameidata nd; - struct dentry *d; - int err; - - err = kern_path_parent(watch->path, &nd); - if (err) - return err; - - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - return -EINVAL; - } - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - return PTR_ERR(d); - } - if (d->d_inode) { - /* update watch filter fields */ - watch->dev = d->d_inode->i_sb->s_dev; - watch->ino = d->d_inode->i_ino; - } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - - *parent = nd.path; - dput(d); - return 0; -} - -/* Associate the given rule with an existing parent. - * Caller must hold audit_filter_mutex. 
*/ -static void audit_add_to_parent(struct audit_krule *krule, - struct audit_parent *parent) -{ - struct audit_watch *w, *watch = krule->watch; - int watch_found = 0; - - BUG_ON(!mutex_is_locked(&audit_filter_mutex)); - - list_for_each_entry(w, &parent->watches, wlist) { - if (strcmp(watch->path, w->path)) - continue; - - watch_found = 1; - - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); - audit_put_watch(watch); - - audit_get_watch(w); - krule->watch = watch = w; - break; - } - - if (!watch_found) { - audit_get_parent(parent); - watch->parent = parent; - - list_add(&watch->wlist, &parent->watches); - } - list_add(&krule->rlist, &watch->rules); -} - -/* Find a matching watch entry, or add this one. - * Caller must hold audit_filter_mutex. */ -int audit_add_watch(struct audit_krule *krule, struct list_head **list) -{ - struct audit_watch *watch = krule->watch; - struct audit_parent *parent; - struct path parent_path; - int h, ret = 0; - - mutex_unlock(&audit_filter_mutex); - - /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - ret = audit_get_nd(watch, &parent_path); - - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - - if (ret) - return ret; - - /* either find an old parent or attach a new one */ - parent = audit_find_parent(parent_path.dentry->d_inode); - if (!parent) { - parent = audit_init_parent(&parent_path); - if (IS_ERR(parent)) { - ret = PTR_ERR(parent); - goto error; - } - } - - audit_add_to_parent(krule, parent); - - /* match get in audit_find_parent or audit_init_parent */ - audit_put_parent(parent); - - h = audit_hash_ino((u32)watch->ino); - *list = &audit_inode_hash[h]; -error: - path_put(&parent_path); - return ret; -} - -void audit_remove_watch_rule(struct audit_krule *krule) -{ - struct audit_watch *watch = krule->watch; - struct audit_parent *parent = watch->parent; - - list_del(&krule->rlist); - - if (list_empty(&watch->rules)) { - audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); - fsnotify_destroy_mark(&parent->mark); - audit_put_parent(parent); - } - } -} - -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - -/* Update watch data in audit rules based on fsnotify events. 
*/ -static int audit_watch_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) -{ - struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; - struct audit_parent *parent; - - parent = container_of(inode_mark, struct audit_parent, mark); - - BUG_ON(group != audit_watch_group); - - switch (event->data_type) { - case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; - break; - case (FSNOTIFY_EVENT_INODE): - inode = event->inode; - break; - default: - BUG(); - inode = NULL; - break; - }; - - if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); - else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) - audit_remove_parent_watches(parent); - - return 0; -} - -static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, - .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, -}; - -static int __init audit_watch_init(void) -{ - audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); - if (IS_ERR(audit_watch_group)) { - audit_watch_group = NULL; - audit_panic("cannot create audit fsnotify group"); - } - return 0; -} -device_initcall(audit_watch_init); -/* auditfilter.c -- filtering of audit events - * - * Copyright 2003-2004 Red Hat, Inc. - * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "audit.h" - -/* - * Locking model: - * - * audit_filter_mutex: - * Synchronizes writes and blocking reads of audit's filterlist - * data. Rcu is used to traverse the filterlist and access - * contents of structs audit_entry, audit_watch and opaque - * LSM rules during filtering. If modified, these structures - * must be copied and replace their counterparts in the filterlist. - * An audit_parent struct is not accessed during filtering, so may - * be written directly provided audit_filter_mutex is held. 
- */ - -/* Audit filter lists, defined in */ -struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { - LIST_HEAD_INIT(audit_filter_list[0]), - LIST_HEAD_INIT(audit_filter_list[1]), - LIST_HEAD_INIT(audit_filter_list[2]), - LIST_HEAD_INIT(audit_filter_list[3]), - LIST_HEAD_INIT(audit_filter_list[4]), - LIST_HEAD_INIT(audit_filter_list[5]), -#if AUDIT_NR_FILTERS != 6 -#error Fix audit_filter_list initialiser -#endif -}; -static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { - LIST_HEAD_INIT(audit_rules_list[0]), - LIST_HEAD_INIT(audit_rules_list[1]), - LIST_HEAD_INIT(audit_rules_list[2]), - LIST_HEAD_INIT(audit_rules_list[3]), - LIST_HEAD_INIT(audit_rules_list[4]), - LIST_HEAD_INIT(audit_rules_list[5]), -}; - -DEFINE_MUTEX(audit_filter_mutex); - -static inline void audit_free_rule(struct audit_entry *e) -{ - int i; - struct audit_krule *erule = &e->rule; - - /* some rules don't have associated watches */ - if (erule->watch) - audit_put_watch(erule->watch); - if (erule->fields) - for (i = 0; i < erule->field_count; i++) { - struct audit_field *f = &erule->fields[i]; - kfree(f->lsm_str); - security_audit_rule_free(f->lsm_rule); - } - kfree(erule->fields); - kfree(erule->filterkey); - kfree(e); -} - -void audit_free_rule_rcu(struct rcu_head *head) -{ - struct audit_entry *e = container_of(head, struct audit_entry, rcu); - audit_free_rule(e); -} - -/* Initialize an audit filterlist entry. */ -static inline struct audit_entry *audit_init_entry(u32 field_count) -{ - struct audit_entry *entry; - struct audit_field *fields; - - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (unlikely(!entry)) - return NULL; - - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); - if (unlikely(!fields)) { - kfree(entry); - return NULL; - } - entry->rule.fields = fields; - - return entry; -} - -/* Unpack a filter field's string representation from user-space - * buffer. 
*/ -char *audit_unpack_string(void **bufp, size_t *remain, size_t len) -{ - char *str; - - if (!*bufp || (len == 0) || (len > *remain)) - return ERR_PTR(-EINVAL); - - /* Of the currently implemented string fields, PATH_MAX - * defines the longest valid length. - */ - if (len > PATH_MAX) - return ERR_PTR(-ENAMETOOLONG); - - str = kmalloc(len + 1, GFP_KERNEL); - if (unlikely(!str)) - return ERR_PTR(-ENOMEM); - - memcpy(str, *bufp, len); - str[len] = 0; - *bufp += len; - *remain -= len; - - return str; -} - -/* Translate an inode field to kernel respresentation. */ -static inline int audit_to_inode(struct audit_krule *krule, - struct audit_field *f) -{ - if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree || - (f->op != Audit_equal && f->op != Audit_not_equal)) - return -EINVAL; - - krule->inode_f = f; - return 0; -} - -static __u32 *classes[AUDIT_SYSCALL_CLASSES]; - -int __init audit_register_class(int class, unsigned *list) -{ - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); - if (!p) - return -ENOMEM; - while (*list != ~0U) { - unsigned n = *list++; - if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { - kfree(p); - return -EINVAL; - } - p[AUDIT_WORD(n)] |= AUDIT_BIT(n); - } - if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { - kfree(p); - return -EINVAL; - } - classes[class] = p; - return 0; -} - -int audit_match_class(int class, unsigned syscall) -{ - if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) - return 0; - if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) - return 0; - return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); -} - -#ifdef CONFIG_AUDITSYSCALL -static inline int audit_match_class_bits(int class, u32 *mask) -{ - int i; - - if (classes[class]) { - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - if (mask[i] & classes[class][i]) - return 0; - } - return 1; -} - -static int audit_match_signal(struct audit_entry *entry) -{ - struct audit_field *arch = 
entry->rule.arch_f; - - if (!arch) { - /* When arch is unspecified, we must check both masks on biarch - * as syscall number alone is ambiguous. */ - return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, - entry->rule.mask) && - audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, - entry->rule.mask)); - } - - switch(audit_classify_arch(arch->val)) { - case 0: /* native */ - return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, - entry->rule.mask)); - case 1: /* 32bit on biarch */ - return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, - entry->rule.mask)); - default: - return 1; - } -} -#endif - -/* Common user-space to kernel rule translation. */ -static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) -{ - unsigned listnr; - struct audit_entry *entry; - int i, err; - - err = -EINVAL; - listnr = rule->flags & ~AUDIT_FILTER_PREPEND; - switch(listnr) { - default: - goto exit_err; -#ifdef CONFIG_AUDITSYSCALL - case AUDIT_FILTER_ENTRY: - if (rule->action == AUDIT_ALWAYS) - goto exit_err; - case AUDIT_FILTER_EXIT: - case AUDIT_FILTER_TASK: -#endif - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: - ; - } - if (unlikely(rule->action == AUDIT_POSSIBLE)) { - printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); - goto exit_err; - } - if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) - goto exit_err; - if (rule->field_count > AUDIT_MAX_FIELDS) - goto exit_err; - - err = -ENOMEM; - entry = audit_init_entry(rule->field_count); - if (!entry) - goto exit_err; - - entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; - entry->rule.listnr = listnr; - entry->rule.action = rule->action; - entry->rule.field_count = rule->field_count; - - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - entry->rule.mask[i] = rule->mask[i]; - - for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { - int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; - __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; - __u32 *class; - - if (!(*p & AUDIT_BIT(bit))) - continue; - *p &= ~AUDIT_BIT(bit); - class = 
classes[i]; - if (class) { - int j; - for (j = 0; j < AUDIT_BITMASK_SIZE; j++) - entry->rule.mask[j] |= class[j]; - } - } - - return entry; - -exit_err: - return ERR_PTR(err); -} - -static u32 audit_ops[] = -{ - [Audit_equal] = AUDIT_EQUAL, - [Audit_not_equal] = AUDIT_NOT_EQUAL, - [Audit_bitmask] = AUDIT_BIT_MASK, - [Audit_bittest] = AUDIT_BIT_TEST, - [Audit_lt] = AUDIT_LESS_THAN, - [Audit_gt] = AUDIT_GREATER_THAN, - [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, - [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, -}; - -static u32 audit_to_op(u32 op) -{ - u32 n; - for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) - ; - return n; -} - - -/* Translate struct audit_rule to kernel's rule respresentation. - * Exists for backward compatibility with userspace. */ -static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) -{ - struct audit_entry *entry; - int err = 0; - int i; - - entry = audit_to_entry_common(rule); - if (IS_ERR(entry)) - goto exit_nofree; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - u32 n; - - n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (n & AUDIT_NEGATE) - f->op = Audit_not_equal; - else if (!n) - f->op = Audit_equal; - else - f->op = audit_to_op(n); - - entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 
2 : 1; - - f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); - f->val = rule->values[i]; - - err = -EINVAL; - if (f->op == Audit_bad) - goto exit_free; - - switch(f->type) { - default: - goto exit_free; - case AUDIT_PID: - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - case AUDIT_LOGINUID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - /* bit ops are only useful on syscall args */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; - /* arch is only allowed to be = or != */ - case AUDIT_ARCH: - if (f->op != Audit_not_equal && f->op != Audit_equal) - goto exit_free; - entry->rule.arch_f = f; - break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - } - } - - if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) - entry->rule.inode_f = NULL; - -exit_nofree: - return entry; - -exit_free: - audit_free_rule(entry); - return ERR_PTR(err); -} - -/* Translate struct audit_rule_data to kernel's rule respresentation. 
*/ -static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, - size_t datasz) -{ - int err = 0; - struct audit_entry *entry; - void *bufp; - size_t remain = datasz - sizeof(struct audit_rule_data); - int i; - char *str; - - entry = audit_to_entry_common((struct audit_rule *)data); - if (IS_ERR(entry)) - goto exit_nofree; - - bufp = data->buf; - entry->rule.vers_ops = 2; - for (i = 0; i < data->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - - err = -EINVAL; - - f->op = audit_to_op(data->fieldflags[i]); - if (f->op == Audit_bad) - goto exit_free; - - f->type = data->fields[i]; - f->val = data->values[i]; - f->lsm_str = NULL; - f->lsm_rule = NULL; - switch(f->type) { - case AUDIT_PID: - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - case AUDIT_LOGINUID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - case AUDIT_OBJ_UID: - case AUDIT_OBJ_GID: - break; - case AUDIT_ARCH: - entry->rule.arch_f = f; - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - - err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->lsm_rule); - /* Keep currently invalid fields around in case they - * become valid after a policy reload. 
*/ - if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM " - "\'%s\' is invalid\n", str); - err = 0; - } - if (err) { - kfree(str); - goto exit_free; - } else - f->lsm_str = str; - break; - case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); - if (err) { - kfree(str); - goto exit_free; - } - break; - case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - - err = audit_make_tree(&entry->rule, str, f->op); - kfree(str); - if (err) - goto exit_free; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) - goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) - goto exit_free; - entry->rule.buflen += f->val; - entry->rule.filterkey = str; - break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_FIELD_COMPARE: - if (f->val > AUDIT_MAX_FIELD_COMPARE) - goto exit_free; - break; - default: - goto exit_free; - } - } - - if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) - entry->rule.inode_f = NULL; - -exit_nofree: - return entry; - -exit_free: - audit_free_rule(entry); - return ERR_PTR(err); -} - -/* Pack a filter field's string representation into data block. */ -static inline size_t audit_pack_string(void **bufp, const char *str) -{ - size_t len = strlen(str); - - memcpy(*bufp, str, len); - *bufp += len; - - return len; -} - -/* Translate kernel rule respresentation to struct audit_rule. - * Exists for backward compatibility with userspace. 
*/ -static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) -{ - struct audit_rule *rule; - int i; - - rule = kzalloc(sizeof(*rule), GFP_KERNEL); - if (unlikely(!rule)) - return NULL; - - rule->flags = krule->flags | krule->listnr; - rule->action = krule->action; - rule->field_count = krule->field_count; - for (i = 0; i < rule->field_count; i++) { - rule->values[i] = krule->fields[i].val; - rule->fields[i] = krule->fields[i].type; - - if (krule->vers_ops == 1) { - if (krule->fields[i].op == Audit_not_equal) - rule->fields[i] |= AUDIT_NEGATE; - } else { - rule->fields[i] |= audit_ops[krule->fields[i].op]; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; - - return rule; -} - -/* Translate kernel rule respresentation to struct audit_rule_data. */ -static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) -{ - struct audit_rule_data *data; - void *bufp; - int i; - - data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); - if (unlikely(!data)) - return NULL; - memset(data, 0, sizeof(*data)); - - data->flags = krule->flags | krule->listnr; - data->action = krule->action; - data->field_count = krule->field_count; - bufp = data->buf; - for (i = 0; i < data->field_count; i++) { - struct audit_field *f = &krule->fields[i]; - - data->fields[i] = f->type; - data->fieldflags[i] = audit_ops[f->op]; - switch(f->type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - data->buflen += data->values[i] = - audit_pack_string(&bufp, f->lsm_str); - break; - case AUDIT_WATCH: - data->buflen += data->values[i] = - audit_pack_string(&bufp, - audit_watch_path(krule->watch)); - break; - case AUDIT_DIR: - data->buflen += data->values[i] = - audit_pack_string(&bufp, - audit_tree_path(krule->tree)); - break; - case 
AUDIT_FILTERKEY: - data->buflen += data->values[i] = - audit_pack_string(&bufp, krule->filterkey); - break; - default: - data->values[i] = f->val; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; - - return data; -} - -/* Compare two rules in kernel format. Considered success if rules - * don't match. */ -static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) -{ - int i; - - if (a->flags != b->flags || - a->listnr != b->listnr || - a->action != b->action || - a->field_count != b->field_count) - return 1; - - for (i = 0; i < a->field_count; i++) { - if (a->fields[i].type != b->fields[i].type || - a->fields[i].op != b->fields[i].op) - return 1; - - switch(a->fields[i].type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) - return 1; - break; - case AUDIT_WATCH: - if (strcmp(audit_watch_path(a->watch), - audit_watch_path(b->watch))) - return 1; - break; - case AUDIT_DIR: - if (strcmp(audit_tree_path(a->tree), - audit_tree_path(b->tree))) - return 1; - break; - case AUDIT_FILTERKEY: - /* both filterkeys exist based on above type compare */ - if (strcmp(a->filterkey, b->filterkey)) - return 1; - break; - default: - if (a->fields[i].val != b->fields[i].val) - return 1; - } - } - - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - if (a->mask[i] != b->mask[i]) - return 1; - - return 0; -} - -/* Duplicate LSM field information. The lsm_rule is opaque, so must be - * re-initialized. 
*/ -static inline int audit_dupe_lsm_field(struct audit_field *df, - struct audit_field *sf) -{ - int ret = 0; - char *lsm_str; - - /* our own copy of lsm_str */ - lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); - if (unlikely(!lsm_str)) - return -ENOMEM; - df->lsm_str = lsm_str; - - /* our own (refreshed) copy of lsm_rule */ - ret = security_audit_rule_init(df->type, df->op, df->lsm_str, - (void **)&df->lsm_rule); - /* Keep currently invalid fields around in case they - * become valid after a policy reload. */ - if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->lsm_str); - ret = 0; - } - - return ret; -} - -/* Duplicate an audit rule. This will be a deep copy with the exception - * of the watch - that pointer is carried over. The LSM specific fields - * will be updated in the copy. The point is to be able to replace the old - * rule with the new rule in the filterlist, then free the old rule. - * The rlist element is undefined; list manipulations are handled apart from - * the initial copy. */ -struct audit_entry *audit_dupe_rule(struct audit_krule *old) -{ - u32 fcount = old->field_count; - struct audit_entry *entry; - struct audit_krule *new; - char *fk; - int i, err = 0; - - entry = audit_init_entry(fcount); - if (unlikely(!entry)) - return ERR_PTR(-ENOMEM); - - new = &entry->rule; - new->vers_ops = old->vers_ops; - new->flags = old->flags; - new->listnr = old->listnr; - new->action = old->action; - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - new->mask[i] = old->mask[i]; - new->prio = old->prio; - new->buflen = old->buflen; - new->inode_f = old->inode_f; - new->field_count = old->field_count; - - /* - * note that we are OK with not refcounting here; audit_match_tree() - * never dereferences tree and we can't get false positives there - * since we'd have to have rule gone from the list *and* removed - * before the chunks found by lookup had been allocated, i.e. before - * the beginning of list scan. 
- */ - new->tree = old->tree; - memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); - - /* deep copy this information, updating the lsm_rule fields, because - * the originals will all be freed when the old rule is freed. */ - for (i = 0; i < fcount; i++) { - switch (new->fields[i].type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - err = audit_dupe_lsm_field(&new->fields[i], - &old->fields[i]); - break; - case AUDIT_FILTERKEY: - fk = kstrdup(old->filterkey, GFP_KERNEL); - if (unlikely(!fk)) - err = -ENOMEM; - else - new->filterkey = fk; - } - if (err) { - audit_free_rule(entry); - return ERR_PTR(err); - } - } - - if (old->watch) { - audit_get_watch(old->watch); - new->watch = old->watch; - } - - return entry; -} - -/* Find an existing audit rule. - * Caller must hold audit_filter_mutex to prevent stale rule data. */ -static struct audit_entry *audit_find_rule(struct audit_entry *entry, - struct list_head **p) -{ - struct audit_entry *e, *found = NULL; - struct list_head *list; - int h; - - if (entry->rule.inode_f) { - h = audit_hash_ino(entry->rule.inode_f->val); - *p = list = &audit_inode_hash[h]; - } else if (entry->rule.watch) { - /* we don't know the inode number, so must walk entire hash */ - for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { - list = &audit_inode_hash[h]; - list_for_each_entry(e, list, list) - if (!audit_compare_rule(&entry->rule, &e->rule)) { - found = e; - goto out; - } - } - goto out; - } else { - *p = list = &audit_filter_list[entry->rule.listnr]; - } - - list_for_each_entry(e, list, list) - if (!audit_compare_rule(&entry->rule, &e->rule)) { - found = e; - goto out; - } - -out: - return found; -} - -static u64 prio_low = ~0ULL/2; -static u64 prio_high = ~0ULL/2 - 1; - -/* Add rule to given filterlist if not a duplicate. 
*/ -static inline int audit_add_rule(struct audit_entry *entry) -{ - struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; - struct audit_tree *tree = entry->rule.tree; - struct list_head *list; - int err; -#ifdef CONFIG_AUDITSYSCALL - int dont_count = 0; - - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) - dont_count = 1; -#endif - - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, &list); - if (e) { - mutex_unlock(&audit_filter_mutex); - err = -EEXIST; - /* normally audit_add_tree_rule() will free it on failure */ - if (tree) - audit_put_tree(tree); - goto error; - } - - if (watch) { - /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule, &list); - if (err) { - mutex_unlock(&audit_filter_mutex); - goto error; - } - } - if (tree) { - err = audit_add_tree_rule(&entry->rule); - if (err) { - mutex_unlock(&audit_filter_mutex); - goto error; - } - } - - entry->rule.prio = ~0ULL; - if (entry->rule.listnr == AUDIT_FILTER_EXIT) { - if (entry->rule.flags & AUDIT_FILTER_PREPEND) - entry->rule.prio = ++prio_high; - else - entry->rule.prio = --prio_low; - } - - if (entry->rule.flags & AUDIT_FILTER_PREPEND) { - list_add(&entry->rule.list, - &audit_rules_list[entry->rule.listnr]); - list_add_rcu(&entry->list, list); - entry->rule.flags &= ~AUDIT_FILTER_PREPEND; - } else { - list_add_tail(&entry->rule.list, - &audit_rules_list[entry->rule.listnr]); - list_add_tail_rcu(&entry->list, list); - } -#ifdef CONFIG_AUDITSYSCALL - if (!dont_count) - audit_n_rules++; - - if (!audit_match_signal(entry)) - audit_signals++; -#endif - mutex_unlock(&audit_filter_mutex); - - return 0; - -error: - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ - return err; -} - -/* Remove an existing rule from filterlist. 
*/ -static inline int audit_del_rule(struct audit_entry *entry) -{ - struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; - struct audit_tree *tree = entry->rule.tree; - struct list_head *list; - int ret = 0; -#ifdef CONFIG_AUDITSYSCALL - int dont_count = 0; - - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) - dont_count = 1; -#endif - - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, &list); - if (!e) { - mutex_unlock(&audit_filter_mutex); - ret = -ENOENT; - goto out; - } - - if (e->rule.watch) - audit_remove_watch_rule(&e->rule); - - if (e->rule.tree) - audit_remove_tree_rule(&e->rule); - - list_del_rcu(&e->list); - list_del(&e->rule.list); - call_rcu(&e->rcu, audit_free_rule_rcu); - -#ifdef CONFIG_AUDITSYSCALL - if (!dont_count) - audit_n_rules--; - - if (!audit_match_signal(entry)) - audit_signals--; -#endif - mutex_unlock(&audit_filter_mutex); - -out: - if (watch) - audit_put_watch(watch); /* match initial get */ - if (tree) - audit_put_tree(tree); /* that's the temporary one */ - - return ret; -} - -/* List rules using struct audit_rule. Exists for backward - * compatibility with userspace. */ -static void audit_list(int pid, int seq, struct sk_buff_head *q) -{ - struct sk_buff *skb; - struct audit_krule *r; - int i; - - /* This is a blocking read, so use audit_filter_mutex instead of rcu - * iterator to sync with list writers. 
*/ - for (i=0; ibuflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); - if (skb) - skb_queue_tail(q, skb); -} - -/* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, - char *action, struct audit_krule *rule, - int res) -{ - struct audit_buffer *ab; - - if (!audit_enabled) - return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - if (!ab) - return; - audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " op="); - audit_log_string(ab, action); - audit_log_key(ab, rule->filterkey); - audit_log_format(ab, " list=%d res=%d", rule->listnr, res); - audit_log_end(ab); -} - -/** - * audit_receive_filter - apply all rules to the specified message type - * @type: audit message type - * @pid: target pid for netlink audit messages - * @uid: target uid for netlink audit messages - * @seq: netlink audit message sequence (serial) number - * @data: payload data - * @datasz: size of payload data - * @loginuid: loginuid of sender - * @sessionid: sessionid for netlink audit message - * @sid: SE Linux Security ID of sender - */ -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) -{ - struct task_struct *tsk; - struct audit_netlink_list *dest; - int err = 0; - struct audit_entry *entry; - - switch (type) { - case AUDIT_LIST: - case AUDIT_LIST_RULES: - /* We can't just spew out the rules here because we might fill - * the available socket buffer space and deadlock waiting for - * auditctl to read from it... 
which isn't ever going to - * happen if we're actually running in the context of auditctl - * trying to _send_ the stuff */ - - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); - if (!dest) - return -ENOMEM; - dest->pid = pid; - skb_queue_head_init(&dest->q); - - mutex_lock(&audit_filter_mutex); - if (type == AUDIT_LIST) - audit_list(pid, seq, &dest->q); - else - audit_list_rules(pid, seq, &dest->q); - mutex_unlock(&audit_filter_mutex); - - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); - if (IS_ERR(tsk)) { - skb_queue_purge(&dest->q); - kfree(dest); - err = PTR_ERR(tsk); - } - break; - case AUDIT_ADD: - case AUDIT_ADD_RULE: - if (type == AUDIT_ADD) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add rule", - &entry->rule, !err); - - if (err) - audit_free_rule(entry); - break; - case AUDIT_DEL: - case AUDIT_DEL_RULE: - if (type == AUDIT_DEL) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove rule", - &entry->rule, !err); - - audit_free_rule(entry); - break; - default: - return -EINVAL; - } - - return err; -} - -int audit_comparator(u32 left, u32 op, u32 right) -{ - switch (op) { - case Audit_equal: - return (left == right); - case Audit_not_equal: - return (left != right); - case Audit_lt: - return (left < right); - case Audit_le: - return (left <= right); - case Audit_gt: - return (left > right); - case Audit_ge: - return (left >= right); - case Audit_bitmask: - return (left & right); - case Audit_bittest: - return ((left & right) == right); - default: - BUG(); - return 0; - } -} - -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. 
*/ -int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen) -{ - int dlen, plen; - const char *p; - - if (!dname || !path) - return 1; - - dlen = strlen(dname); - plen = strlen(path); - if (plen < dlen) - return 1; - - /* disregard trailing slashes */ - p = path + plen - 1; - while ((*p == '/') && (p > path)) - p--; - - /* find last path component */ - p = p - dlen + 1; - if (p < path) - return 1; - else if (p > path) { - if (*--p != '/') - return 1; - else - p++; - } - - /* return length of path's directory component */ - if (dirlen) - *dirlen = p - path; - return strncmp(p, dname, dlen); -} - -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_krule *rule, - enum audit_state *state) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - int result = 0; - u32 sid; - - switch (f->type) { - case AUDIT_PID: - result = audit_comparator(cb->creds.pid, f->op, f->val); - break; - case AUDIT_UID: - result = audit_comparator(cb->creds.uid, f->op, f->val); - break; - case AUDIT_GID: - result = audit_comparator(cb->creds.gid, f->op, f->val); - break; - case AUDIT_LOGINUID: - result = audit_comparator(audit_get_loginuid(current), - f->op, f->val); - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - if (f->lsm_rule) { - security_task_getsecid(current, &sid); - result = security_audit_rule_match(sid, - f->type, - f->op, - f->lsm_rule, - NULL); - } - break; - } - - if (!result) - return 0; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -int audit_filter_user(struct netlink_skb_parms *cb) -{ - enum audit_state state = AUDIT_DISABLED; - struct audit_entry *e; - int ret = 1; - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if 
(audit_filter_user_rules(cb, &e->rule, &state)) { - if (state == AUDIT_DISABLED) - ret = 0; - break; - } - } - rcu_read_unlock(); - - return ret; /* Audit by default */ -} - -int audit_filter_type(int type) -{ - struct audit_entry *e; - int result = 0; - - rcu_read_lock(); - if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) - goto unlock_and_return; - - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], - list) { - int i; - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_MSGTYPE) { - result = audit_comparator(type, f->op, f->val); - if (!result) - break; - } - } - if (result) - goto unlock_and_return; - } -unlock_and_return: - rcu_read_unlock(); - return result; -} - -static int update_lsm_rule(struct audit_krule *r) -{ - struct audit_entry *entry = container_of(r, struct audit_entry, rule); - struct audit_entry *nentry; - int err = 0; - - if (!security_audit_rule_known(r)) - return 0; - - nentry = audit_dupe_rule(r); - if (IS_ERR(nentry)) { - /* save the first error encountered for the - * return value */ - err = PTR_ERR(nentry); - audit_panic("error updating LSM filters"); - if (r->watch) - list_del(&r->rlist); - list_del_rcu(&entry->list); - list_del(&r->list); - } else { - if (r->watch || r->tree) - list_replace_init(&r->rlist, &nentry->rule.rlist); - list_replace_rcu(&entry->list, &nentry->list); - list_replace(&r->list, &nentry->rule.list); - } - call_rcu(&entry->rcu, audit_free_rule_rcu); - - return err; -} - -/* This function will re-initialize the lsm_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain LSM - * specific filter fields. When such a rule is found, it is copied, the - * LSM field is re-initialized, and the old rule is replaced with the - * updated rule. 
*/ -int audit_update_lsm_rules(void) -{ - struct audit_krule *r, *n; - int i, err = 0; - - /* audit_filter_mutex synchronizes the writers */ - mutex_lock(&audit_filter_mutex); - - for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { - int res = update_lsm_rule(r); - if (!err) - err = res; - } - } - mutex_unlock(&audit_filter_mutex); - - return err; -} -/* auditsc.c -- System-call auditing support - * Handles all system-call specific auditing features. - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright (C) 2005, 2006 IBM Corporation - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Written by Rickard E. (Rik) Faith - * - * Many of the ideas implemented here are from Stephen C. Tweedie, - * especially the idea of avoiding a copy by using getname. - * - * The method for actual interception of syscall entry and exit (not in - * this file -- see entry.S) is based on a GPL'd patch written by - * okir@suse.de and Copyright 2003 SuSE Linux AG. - * - * POSIX message queue support added by George Wilson , - * 2006. - * - * The support of additional filter rules compares (>, <, >=, <=) was - * added by Dustin Kirkland , 2005. 
- * - * Modified by Amy Griffis to collect additional - * filesystem information. - * - * Subject and object context labeling support added by - * and for LSPP certification compliance. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "audit.h" - -/* flags stating the success for a syscall */ -#define AUDITSC_INVALID 0 -#define AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 - -/* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). If we get more names we will allocate - * a name dynamically and also add those to the list anchored by names_list. */ -#define AUDIT_NAMES 5 - -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 - -/* no execve audit message should be longer than this (userspace limits) */ -#define MAX_EXECVE_AUDIT_LEN 7500 - -/* number of audit rules */ -int audit_n_rules; - -/* determines whether we collect data for signals sent */ -int audit_signals; - -struct audit_cap_data { - kernel_cap_t permitted; - kernel_cap_t inheritable; - union { - unsigned int fE; /* effective bit of a file capability */ - kernel_cap_t effective; /* effective set of a process */ - }; -}; - -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). - * - * Further, in fs/namei.c:path_lookup() we store the inode and device. 
*/ -struct audit_names { - struct list_head list; /* audit_context->names_list */ - const char *name; - unsigned long ino; - dev_t dev; - umode_t mode; - uid_t uid; - gid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - bool name_put; /* call __putname() for this name */ - /* - * This was an allocated audit_names and not from the array of - * names allocated in the task audit context. Thus this name - * should be freed on syscall exit - */ - bool should_free; -}; - -struct audit_aux_data { - struct audit_aux_data *next; - int type; -}; - -#define AUDIT_AUX_IPCPERM 0 - -/* Number of target pids per aux struct. */ -#define AUDIT_AUX_PIDS 16 - -struct audit_aux_data_execve { - struct audit_aux_data d; - int argc; - int envc; - struct mm_struct *mm; -}; - -struct audit_aux_data_pids { - struct audit_aux_data d; - pid_t target_pid[AUDIT_AUX_PIDS]; - uid_t target_auid[AUDIT_AUX_PIDS]; - uid_t target_uid[AUDIT_AUX_PIDS]; - unsigned int target_sessionid[AUDIT_AUX_PIDS]; - u32 target_sid[AUDIT_AUX_PIDS]; - char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; - int pid_count; -}; - -struct audit_aux_data_bprm_fcaps { - struct audit_aux_data d; - struct audit_cap_data fcap; - unsigned int fcap_ver; - struct audit_cap_data old_pcap; - struct audit_cap_data new_pcap; -}; - -struct audit_aux_data_capset { - struct audit_aux_data d; - pid_t pid; - struct audit_cap_data cap; -}; - -struct audit_tree_refs { - struct audit_tree_refs *next; - struct audit_chunk *c[31]; -}; - -/* The per-task audit context. 
*/ -struct audit_context { - int dummy; /* must be the first element */ - int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ - int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ - unsigned long argv[4]; /* syscall arguments */ - long return_code;/* syscall return code */ - u64 prio; - int return_valid; /* return code is valid */ - /* - * The names_list is the list of all audit_names collected during this - * syscall. The first AUDIT_NAMES entries in the names_list will - * actually be from the preallocated_names array for performance - * reasons. Except during allocation they should never be referenced - * through the preallocated_names array and should only be found/used - * by running the names_list. - */ - struct audit_names preallocated_names[AUDIT_NAMES]; - int name_count; /* total records in names_list */ - struct list_head names_list; /* anchor for struct audit_names->list */ - char * filterkey; /* key for rule that triggered record */ - struct path pwd; - struct audit_context *previous; /* For nested syscalls */ - struct audit_aux_data *aux; - struct audit_aux_data *aux_pids; - struct sockaddr_storage *sockaddr; - size_t sockaddr_len; - /* Save things to print about task_struct */ - pid_t pid, ppid; - uid_t uid, euid, suid, fsuid; - gid_t gid, egid, sgid, fsgid; - unsigned long personality; - int arch; - - pid_t target_pid; - uid_t target_auid; - uid_t target_uid; - unsigned int target_sessionid; - u32 target_sid; - char target_comm[TASK_COMM_LEN]; - - struct audit_tree_refs *trees, *first_trees; - struct list_head killed_trees; - int tree_count; - - int type; - union { - struct { - int nargs; - long args[6]; - } socketcall; - struct { - uid_t uid; - gid_t gid; - umode_t mode; - u32 osid; - int has_perm; - uid_t perm_uid; - gid_t perm_gid; - umode_t perm_mode; - unsigned long qbytes; - } ipc; - struct { - mqd_t mqdes; - struct 
mq_attr mqstat; - } mq_getsetattr; - struct { - mqd_t mqdes; - int sigev_signo; - } mq_notify; - struct { - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; - } mq_sendrecv; - struct { - int oflag; - umode_t mode; - struct mq_attr attr; - } mq_open; - struct { - pid_t pid; - struct audit_cap_data cap; - } capset; - struct { - int fd; - int flags; - } mmap; - }; - int fds[2]; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif -}; - -static inline int open_arg(int flags, int mask) -{ - int n = ACC_MODE(flags); - if (flags & (O_TRUNC | O_CREAT)) - n |= AUDIT_PERM_WRITE; - return n & mask; -} - -static int audit_match_perm(struct audit_context *ctx, int mask) -{ - unsigned n; - if (unlikely(!ctx)) - return 0; - n = ctx->major; - - switch (audit_classify_syscall(ctx->arch, n)) { - case 0: /* native */ - if ((mask & AUDIT_PERM_WRITE) && - audit_match_class(AUDIT_CLASS_WRITE, n)) - return 1; - if ((mask & AUDIT_PERM_READ) && - audit_match_class(AUDIT_CLASS_READ, n)) - return 1; - if ((mask & AUDIT_PERM_ATTR) && - audit_match_class(AUDIT_CLASS_CHATTR, n)) - return 1; - return 0; - case 1: /* 32bit on biarch */ - if ((mask & AUDIT_PERM_WRITE) && - audit_match_class(AUDIT_CLASS_WRITE_32, n)) - return 1; - if ((mask & AUDIT_PERM_READ) && - audit_match_class(AUDIT_CLASS_READ_32, n)) - return 1; - if ((mask & AUDIT_PERM_ATTR) && - audit_match_class(AUDIT_CLASS_CHATTR_32, n)) - return 1; - return 0; - case 2: /* open */ - return mask & ACC_MODE(ctx->argv[1]); - case 3: /* openat */ - return mask & ACC_MODE(ctx->argv[2]); - case 4: /* socketcall */ - return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); - case 5: /* execve */ - return mask & AUDIT_PERM_EXEC; - default: - return 0; - } -} - -static int audit_match_filetype(struct audit_context *ctx, int val) -{ - struct audit_names *n; - umode_t mode = (umode_t)val; - - if (unlikely(!ctx)) - return 0; - - list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) 
&& - ((n->mode & S_IFMT) == mode)) - return 1; - } - - return 0; -} - -/* - * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; - * ->first_trees points to its beginning, ->trees - to the current end of data. - * ->tree_count is the number of free entries in array pointed to by ->trees. - * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL, - * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously, - * it's going to remain 1-element for almost any setup) until we free context itself. - * References in it _are_ dropped - at the same time we free/drop aux stuff. - */ - -#ifdef CONFIG_AUDIT_TREE -static void audit_set_auditable(struct audit_context *ctx) -{ - if (!ctx->prio) { - ctx->prio = 1; - ctx->current_state = AUDIT_RECORD_CONTEXT; - } -} - -static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) -{ - struct audit_tree_refs *p = ctx->trees; - int left = ctx->tree_count; - if (likely(left)) { - p->c[--left] = chunk; - ctx->tree_count = left; - return 1; - } - if (!p) - return 0; - p = p->next; - if (p) { - p->c[30] = chunk; - ctx->trees = p; - ctx->tree_count = 30; - return 1; - } - return 0; -} - -static int grow_tree_refs(struct audit_context *ctx) -{ - struct audit_tree_refs *p = ctx->trees; - ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL); - if (!ctx->trees) { - ctx->trees = p; - return 0; - } - if (p) - p->next = ctx->trees; - else - ctx->first_trees = ctx->trees; - ctx->tree_count = 31; - return 1; -} -#endif - -static void unroll_tree_refs(struct audit_context *ctx, - struct audit_tree_refs *p, int count) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_tree_refs *q; - int n; - if (!p) { - /* we started with empty chain */ - p = ctx->first_trees; - count = 31; - /* if the very first allocation has failed, nothing to do */ - if (!p) - return; - } - n = count; - for (q = p; q != ctx->trees; q = q->next, n = 31) { - while (n--) { - 
audit_put_chunk(q->c[n]); - q->c[n] = NULL; - } - } - while (n-- > ctx->tree_count) { - audit_put_chunk(q->c[n]); - q->c[n] = NULL; - } - ctx->trees = p; - ctx->tree_count = count; -#endif -} - -static void free_tree_refs(struct audit_context *ctx) -{ - struct audit_tree_refs *p, *q; - for (p = ctx->first_trees; p; p = q) { - q = p->next; - kfree(p); - } -} - -static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_tree_refs *p; - int n; - if (!tree) - return 0; - /* full ones */ - for (p = ctx->first_trees; p != ctx->trees; p = p->next) { - for (n = 0; n < 31; n++) - if (audit_tree_match(p->c[n], tree)) - return 1; - } - /* partial */ - if (p) { - for (n = ctx->tree_count; n < 31; n++) - if (audit_tree_match(p->c[n], tree)) - return 1; - } -#endif - return 0; -} - -static int audit_compare_id(uid_t uid1, - struct audit_names *name, - unsigned long name_offset, - struct audit_field *f, - struct audit_context *ctx) -{ - struct audit_names *n; - unsigned long addr; - uid_t uid2; - int rc; - - BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - - if (name) { - addr = (unsigned long)name; - addr += name_offset; - - uid2 = *(uid_t *)addr; - rc = audit_comparator(uid1, f->op, uid2); - if (rc) - return rc; - } - - if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - addr = (unsigned long)n; - addr += name_offset; - - uid2 = *(uid_t *)addr; - - rc = audit_comparator(uid1, f->op, uid2); - if (rc) - return rc; - } - } - return 0; -} - -static int audit_field_compare(struct task_struct *tsk, - const struct cred *cred, - struct audit_field *f, - struct audit_context *ctx, - struct audit_names *name) -{ - switch (f->val) { - /* process to file object comparisons */ - case AUDIT_COMPARE_UID_TO_OBJ_UID: - return audit_compare_id(cred->uid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_GID_TO_OBJ_GID: - return audit_compare_id(cred->gid, - name, offsetof(struct audit_names, gid), 
- f, ctx); - case AUDIT_COMPARE_EUID_TO_OBJ_UID: - return audit_compare_id(cred->euid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_EGID_TO_OBJ_GID: - return audit_compare_id(cred->egid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_id(tsk->loginuid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_SUID_TO_OBJ_UID: - return audit_compare_id(cred->suid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_SGID_TO_OBJ_GID: - return audit_compare_id(cred->sgid, - name, offsetof(struct audit_names, gid), - f, ctx); - case AUDIT_COMPARE_FSUID_TO_OBJ_UID: - return audit_compare_id(cred->fsuid, - name, offsetof(struct audit_names, uid), - f, ctx); - case AUDIT_COMPARE_FSGID_TO_OBJ_GID: - return audit_compare_id(cred->fsgid, - name, offsetof(struct audit_names, gid), - f, ctx); - /* uid comparisons */ - case AUDIT_COMPARE_UID_TO_AUID: - return audit_comparator(cred->uid, f->op, tsk->loginuid); - case AUDIT_COMPARE_UID_TO_EUID: - return audit_comparator(cred->uid, f->op, cred->euid); - case AUDIT_COMPARE_UID_TO_SUID: - return audit_comparator(cred->uid, f->op, cred->suid); - case AUDIT_COMPARE_UID_TO_FSUID: - return audit_comparator(cred->uid, f->op, cred->fsuid); - /* auid comparisons */ - case AUDIT_COMPARE_AUID_TO_EUID: - return audit_comparator(tsk->loginuid, f->op, cred->euid); - case AUDIT_COMPARE_AUID_TO_SUID: - return audit_comparator(tsk->loginuid, f->op, cred->suid); - case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_comparator(tsk->loginuid, f->op, cred->fsuid); - /* euid comparisons */ - case AUDIT_COMPARE_EUID_TO_SUID: - return audit_comparator(cred->euid, f->op, cred->suid); - case AUDIT_COMPARE_EUID_TO_FSUID: - return audit_comparator(cred->euid, f->op, cred->fsuid); - /* suid comparisons */ - case AUDIT_COMPARE_SUID_TO_FSUID: - return audit_comparator(cred->suid, f->op, cred->fsuid); - /* gid comparisons */ - case 
AUDIT_COMPARE_GID_TO_EGID: - return audit_comparator(cred->gid, f->op, cred->egid); - case AUDIT_COMPARE_GID_TO_SGID: - return audit_comparator(cred->gid, f->op, cred->sgid); - case AUDIT_COMPARE_GID_TO_FSGID: - return audit_comparator(cred->gid, f->op, cred->fsgid); - /* egid comparisons */ - case AUDIT_COMPARE_EGID_TO_SGID: - return audit_comparator(cred->egid, f->op, cred->sgid); - case AUDIT_COMPARE_EGID_TO_FSGID: - return audit_comparator(cred->egid, f->op, cred->fsgid); - /* sgid comparison */ - case AUDIT_COMPARE_SGID_TO_FSGID: - return audit_comparator(cred->sgid, f->op, cred->fsgid); - default: - WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); - return 0; - } - return 0; -} - -/* Determine if any context name data matches a rule's watch data */ -/* Compare a task_struct with an audit_rule. Return 1 on match, 0 - * otherwise. - * - * If task_creation is true, this is an explicit indication that we are - * filtering a task rule at task creation time. This and tsk == current are - * the only situations where tsk->cred may be accessed without an rcu read lock. 
- */ -static int audit_filter_rules(struct task_struct *tsk, - struct audit_krule *rule, - struct audit_context *ctx, - struct audit_names *name, - enum audit_state *state, - bool task_creation) -{ - const struct cred *cred; - int i, need_sid = 1; - u32 sid; - - cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - struct audit_names *n; - int result = 0; - - switch (f->type) { - case AUDIT_PID: - result = audit_comparator(tsk->pid, f->op, f->val); - break; - case AUDIT_PPID: - if (ctx) { - if (!ctx->ppid) - ctx->ppid = sys_getppid(); - result = audit_comparator(ctx->ppid, f->op, f->val); - } - break; - case AUDIT_UID: - result = audit_comparator(cred->uid, f->op, f->val); - break; - case AUDIT_EUID: - result = audit_comparator(cred->euid, f->op, f->val); - break; - case AUDIT_SUID: - result = audit_comparator(cred->suid, f->op, f->val); - break; - case AUDIT_FSUID: - result = audit_comparator(cred->fsuid, f->op, f->val); - break; - case AUDIT_GID: - result = audit_comparator(cred->gid, f->op, f->val); - break; - case AUDIT_EGID: - result = audit_comparator(cred->egid, f->op, f->val); - break; - case AUDIT_SGID: - result = audit_comparator(cred->sgid, f->op, f->val); - break; - case AUDIT_FSGID: - result = audit_comparator(cred->fsgid, f->op, f->val); - break; - case AUDIT_PERS: - result = audit_comparator(tsk->personality, f->op, f->val); - break; - case AUDIT_ARCH: - if (ctx) - result = audit_comparator(ctx->arch, f->op, f->val); - break; - - case AUDIT_EXIT: - if (ctx && ctx->return_valid) - result = audit_comparator(ctx->return_code, f->op, f->val); - break; - case AUDIT_SUCCESS: - if (ctx && ctx->return_valid) { - if (f->val) - result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); - else - result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); - } - break; - case AUDIT_DEVMAJOR: - if (name) { - if 
(audit_comparator(MAJOR(name->dev), f->op, f->val) || - audit_comparator(MAJOR(name->rdev), f->op, f->val)) - ++result; - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MAJOR(n->dev), f->op, f->val) || - audit_comparator(MAJOR(n->rdev), f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_DEVMINOR: - if (name) { - if (audit_comparator(MINOR(name->dev), f->op, f->val) || - audit_comparator(MINOR(name->rdev), f->op, f->val)) - ++result; - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MINOR(n->dev), f->op, f->val) || - audit_comparator(MINOR(n->rdev), f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_INODE: - if (name) - result = (name->ino == f->val); - else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->ino, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_OBJ_UID: - if (name) { - result = audit_comparator(name->uid, f->op, f->val); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->uid, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_OBJ_GID: - if (name) { - result = audit_comparator(name->gid, f->op, f->val); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(n->gid, f->op, f->val)) { - ++result; - break; - } - } - } - break; - case AUDIT_WATCH: - if (name) - result = audit_watch_compare(rule->watch, name->ino, name->dev); - break; - case AUDIT_DIR: - if (ctx) - result = match_tree_refs(ctx, rule->tree); - break; - case AUDIT_LOGINUID: - result = 0; - if (ctx) - result = audit_comparator(tsk->loginuid, f->op, f->val); - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - /* NOTE: this may return negative values indicating - a temporary error. 
We simply treat this as a - match for now to avoid losing information that - may be wanted. An error message will also be - logged upon error */ - if (f->lsm_rule) { - if (need_sid) { - security_task_getsecid(tsk, &sid); - need_sid = 0; - } - result = security_audit_rule_match(sid, f->type, - f->op, - f->lsm_rule, - ctx); - } - break; - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR - also applies here */ - if (f->lsm_rule) { - /* Find files that match */ - if (name) { - result = security_audit_rule_match( - name->osid, f->type, f->op, - f->lsm_rule, ctx); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { - ++result; - break; - } - } - } - /* Find ipc objects that match */ - if (!ctx || ctx->type != AUDIT_IPC) - break; - if (security_audit_rule_match(ctx->ipc.osid, - f->type, f->op, - f->lsm_rule, ctx)) - ++result; - } - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - if (ctx) - result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); - break; - case AUDIT_FILTERKEY: - /* ignore this field for filtering */ - result = 1; - break; - case AUDIT_PERM: - result = audit_match_perm(ctx, f->val); - break; - case AUDIT_FILETYPE: - result = audit_match_filetype(ctx, f->val); - break; - case AUDIT_FIELD_COMPARE: - result = audit_field_compare(tsk, cred, f, ctx, name); - break; - } - if (!result) - return 0; - } - - if (ctx) { - if (rule->prio <= ctx->prio) - return 0; - if (rule->filterkey) { - kfree(ctx->filterkey); - ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); - } - ctx->prio = rule->prio; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -/* At process creation time, we 
can determine if system-call auditing is - * completely disabled for this task. Since we only have the task - * structure at this point, we can only check uid and gid. - */ -static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) -{ - struct audit_entry *e; - enum audit_state state; - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, NULL, - &state, true)) { - if (state == AUDIT_RECORD_CONTEXT) - *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); - rcu_read_unlock(); - return state; - } - } - rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; -} - -/* At syscall entry and exit time, this filter is called if the - * audit_state is not low enough that auditing cannot take place, but is - * also not high enough that we already know we have to write an audit - * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). - */ -static enum audit_state audit_filter_syscall(struct task_struct *tsk, - struct audit_context *ctx, - struct list_head *list) -{ - struct audit_entry *e; - enum audit_state state; - - if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; - - rcu_read_lock(); - if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return state; - } - } - } - rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; -} - -/* - * Given an audit_name check the inode hash table to see if they match. 
- * Called holding the rcu read lock to protect the use of audit_inode_hash - */ -static int audit_filter_inode_name(struct task_struct *tsk, - struct audit_names *n, - struct audit_context *ctx) { - int word, bit; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - struct audit_entry *e; - enum audit_state state; - - word = AUDIT_WORD(ctx->major); - bit = AUDIT_BIT(ctx->major); - - if (list_empty(list)) - return 0; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { - ctx->current_state = state; - return 1; - } - } - - return 0; -} - -/* At syscall exit time, this filter is called if any audit_names have been - * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names. - * Regarding audit_state, same rules apply as for audit_filter_syscall(). - */ -void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) -{ - struct audit_names *n; - - if (audit_pid && tsk->tgid == audit_pid) - return; - - rcu_read_lock(); - - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_filter_inode_name(tsk, n, ctx)) - break; - } - rcu_read_unlock(); -} - -static inline struct audit_context *audit_get_context(struct task_struct *tsk, - int return_valid, - long return_code) -{ - struct audit_context *context = tsk->audit_context; - - if (!context) - return NULL; - context->return_valid = return_valid; - - /* - * we need to fix up the return code in the audit logs if the actual - * return codes are later going to be fixed up by the arch specific - * signal handlers - * - * This is actually a test for: - * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || - * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) - * - * but is faster than a bunch of || - */ - if (unlikely(return_code <= -ERESTARTSYS) && - (return_code >= -ERESTART_RESTARTBLOCK) && - 
(return_code != -ENOIOCTLCMD)) - context->return_code = -EINTR; - else - context->return_code = return_code; - - if (context->in_syscall && !context->dummy) { - audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - audit_filter_inodes(tsk, context); - } - - tsk->audit_context = NULL; - return context; -} - -static inline void audit_free_names(struct audit_context *context) -{ - struct audit_names *n, *next; - -#if AUDIT_DEBUG == 2 - if (context->put_count + context->ino_count != context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d" - " ino_count=%d [NOT freeing]\n", - __FILE__, __LINE__, - context->serial, context->major, context->in_syscall, - context->name_count, context->put_count, - context->ino_count); - list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); - } - dump_stack(); - return; - } -#endif -#if AUDIT_DEBUG - context->put_count = 0; - context->ino_count = 0; -#endif - - list_for_each_entry_safe(n, next, &context->names_list, list) { - list_del(&n->list); - if (n->name && n->name_put) - __putname(n->name); - if (n->should_free) - kfree(n); - } - context->name_count = 0; - path_put(&context->pwd); - context->pwd.dentry = NULL; - context->pwd.mnt = NULL; -} - -static inline void audit_free_aux(struct audit_context *context) -{ - struct audit_aux_data *aux; - - while ((aux = context->aux)) { - context->aux = aux->next; - kfree(aux); - } - while ((aux = context->aux_pids)) { - context->aux_pids = aux->next; - kfree(aux); - } -} - -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; -} - -static inline struct audit_context *audit_alloc_context(enum audit_state state) -{ - struct audit_context *context; - - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) - return NULL; - audit_zero_context(context, state); - INIT_LIST_HEAD(&context->killed_trees); - INIT_LIST_HEAD(&context->names_list); - return context; -} - -/** - * audit_alloc - allocate an audit context block for a task - * @tsk: task - * - * Filter on the task information and allocate a per-task audit context - * if necessary. Doing so turns on system call auditing for the - * specified task. This is called from copy_process, so no lock is - * needed. - */ -int audit_alloc(struct task_struct *tsk) -{ - struct audit_context *context; - enum audit_state state; - char *key = NULL; - - if (likely(!audit_ever_enabled)) - return 0; /* Return if not auditing. */ - - state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) - return 0; - - if (!(context = audit_alloc_context(state))) { - kfree(key); - audit_log_lost("out of memory in audit_alloc"); - return -ENOMEM; - } - context->filterkey = key; - - tsk->audit_context = context; - set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); - return 0; -} - -static inline void audit_free_context(struct audit_context *context) -{ - struct audit_context *previous; - int count = 0; - - do { - previous = context->previous; - if (previous || (count && count < 10)) { - ++count; - printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" - " freeing multiple contexts (%d)\n", - context->serial, context->major, - context->name_count, count); - } - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - free_tree_refs(context); - audit_free_aux(context); - kfree(context->filterkey); - kfree(context->sockaddr); - kfree(context); - context = previous; - } while (context); - if (count >= 10) - printk(KERN_ERR "audit: freed %d contexts\n", count); -} - -void audit_log_task_context(struct audit_buffer *ab) -{ - char *ctx = NULL; - 
unsigned len; - int error; - u32 sid; - - security_task_getsecid(current, &sid); - if (!sid) - return; - - error = security_secid_to_secctx(sid, &ctx, &len); - if (error) { - if (error != -EINVAL) - goto error_path; - return; - } - - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - return; - -error_path: - audit_panic("error in audit_log_task_context"); - return; -} - -EXPORT_SYMBOL(audit_log_task_context); - -static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) -{ - char name[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; - struct vm_area_struct *vma; - - /* tsk == current */ - - get_task_comm(name, tsk); - audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); - - if (mm) { - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, " exe=", - &vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - } - audit_log_task_context(ab); -} - -static int audit_log_pid_context(struct audit_context *context, pid_t pid, - uid_t auid, uid_t uid, unsigned int sessionid, - u32 sid, char *comm) -{ - struct audit_buffer *ab; - char *ctx = NULL; - u32 len; - int rc = 0; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); - if (!ab) - return rc; - - audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, - uid, sessionid); - if (security_secid_to_secctx(sid, &ctx, &len)) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - audit_log_format(ab, " ocomm="); - audit_log_untrustedstring(ab, comm); - audit_log_end(ab); - - return rc; -} - -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? 
an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. 
- */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 
2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - -static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab, - struct audit_aux_data_execve *axi) -{ - int i, len; - size_t len_sent = 0; - const char __user *p; - char *buf; - - if (axi->mm != current->mm) - return; /* execve failed, no additional info */ - - p = (const char __user *)axi->mm->arg_start; - - audit_log_format(*ab, "argc=%d", axi->argc); - - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. 
- */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { - audit_panic("out of memory for argv string\n"); - return; - } - - for (i = 0; i < axi->argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); -} - -static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) { - audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); - } -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); -} - -static void show_special(struct audit_context *context, int *call_panic) -{ - struct audit_buffer *ab; - int i; - - ab = audit_log_start(context, GFP_KERNEL, context->type); - if (!ab) - return; - - switch (context->type) { - case AUDIT_SOCKETCALL: { - int nargs = context->socketcall.nargs; - audit_log_format(ab, "nargs=%d", nargs); - for (i = 0; i < nargs; i++) - audit_log_format(ab, " a%d=%lx", i, - context->socketcall.args[i]); - break; } - case AUDIT_IPC: { - u32 osid = context->ipc.osid; - - audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", - context->ipc.uid, context->ipc.gid, context->ipc.mode); - if (osid) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx(osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", osid); - *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - if (context->ipc.has_perm) { - audit_log_end(ab); - ab = audit_log_start(context, GFP_KERNEL, - AUDIT_IPC_SET_PERM); - 
audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#ho", - context->ipc.qbytes, - context->ipc.perm_uid, - context->ipc.perm_gid, - context->ipc.perm_mode); - if (!ab) - return; - } - break; } - case AUDIT_MQ_OPEN: { - audit_log_format(ab, - "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " - "mq_msgsize=%ld mq_curmsgs=%ld", - context->mq_open.oflag, context->mq_open.mode, - context->mq_open.attr.mq_flags, - context->mq_open.attr.mq_maxmsg, - context->mq_open.attr.mq_msgsize, - context->mq_open.attr.mq_curmsgs); - break; } - case AUDIT_MQ_SENDRECV: { - audit_log_format(ab, - "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", - context->mq_sendrecv.mqdes, - context->mq_sendrecv.msg_len, - context->mq_sendrecv.msg_prio, - context->mq_sendrecv.abs_timeout.tv_sec, - context->mq_sendrecv.abs_timeout.tv_nsec); - break; } - case AUDIT_MQ_NOTIFY: { - audit_log_format(ab, "mqdes=%d sigev_signo=%d", - context->mq_notify.mqdes, - context->mq_notify.sigev_signo); - break; } - case AUDIT_MQ_GETSETATTR: { - struct mq_attr *attr = &context->mq_getsetattr.mqstat; - audit_log_format(ab, - "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " - "mq_curmsgs=%ld ", - context->mq_getsetattr.mqdes, - attr->mq_flags, attr->mq_maxmsg, - attr->mq_msgsize, attr->mq_curmsgs); - break; } - case AUDIT_CAPSET: { - audit_log_format(ab, "pid=%d", context->capset.pid); - audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); - audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); - audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); - break; } - case AUDIT_MMAP: { - audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, - context->mmap.flags); - break; } - } - audit_log_end(ab); -} - -static void audit_log_name(struct audit_context *context, struct audit_names *n, - int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; /* audit_panic has been 
called */ - - audit_log_format(ab, "item=%d", record_num); - - if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); -} - -static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) -{ - const struct cred *cred; - int i, call_panic = 0; - struct audit_buffer *ab; - struct audit_aux_data *aux; - const char *tty; - struct audit_names *n; - - /* tsk == current */ - context->pid = tsk->pid; - if (!context->ppid) - context->ppid = sys_getppid(); - cred = current_cred(); - context->uid = cred->uid; - context->gid = cred->gid; - context->euid = cred->euid; - context->suid = cred->suid; - context->fsuid = cred->fsuid; - context->egid = cred->egid; - context->sgid = cred->sgid; - context->fsgid = cred->fsgid; - context->personality = tsk->personality; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); - if (!ab) - return; /* audit_panic has been called */ - audit_log_format(ab, "arch=%x syscall=%d", - 
context->arch, context->major); - if (context->personality != PER_LINUX) - audit_log_format(ab, " per=%lx", context->personality); - if (context->return_valid) - audit_log_format(ab, " success=%s exit=%ld", - (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", - context->return_code); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - - audit_log_format(ab, - " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " ppid=%d pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - context->argv[0], - context->argv[1], - context->argv[2], - context->argv[3], - context->name_count, - context->ppid, - context->pid, - tsk->loginuid, - context->uid, - context->gid, - context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid, tty, - tsk->sessionid); - - - audit_log_task_info(ab, tsk); - audit_log_key(ab, context->filterkey); - audit_log_end(ab); - - for (aux = context->aux; aux; aux = aux->next) { - - ab = audit_log_start(context, GFP_KERNEL, aux->type); - if (!ab) - continue; /* audit_panic has been called */ - - switch (aux->type) { - - case AUDIT_EXECVE: { - struct audit_aux_data_execve *axi = (void *)aux; - audit_log_execve_info(context, &ab, axi); - break; } - - case AUDIT_BPRM_FCAPS: { - struct audit_aux_data_bprm_fcaps *axs = (void *)aux; - audit_log_format(ab, "fver=%x", axs->fcap_ver); - audit_log_cap(ab, "fp", &axs->fcap.permitted); - audit_log_cap(ab, "fi", &axs->fcap.inheritable); - audit_log_format(ab, " fe=%d", axs->fcap.fE); - audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); - audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); - audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); - audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); - audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); - audit_log_cap(ab, 
"new_pe", &axs->new_pcap.effective); - break; } - - } - audit_log_end(ab); - } - - if (context->type) - show_special(context, &call_panic); - - if (context->fds[0] >= 0) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); - if (ab) { - audit_log_format(ab, "fd0=%d fd1=%d", - context->fds[0], context->fds[1]); - audit_log_end(ab); - } - } - - if (context->sockaddr_len) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); - if (ab) { - audit_log_format(ab, "saddr="); - audit_log_n_hex(ab, (void *)context->sockaddr, - context->sockaddr_len); - audit_log_end(ab); - } - } - - for (aux = context->aux_pids; aux; aux = aux->next) { - struct audit_aux_data_pids *axs = (void *)aux; - - for (i = 0; i < axs->pid_count; i++) - if (audit_log_pid_context(context, axs->target_pid[i], - axs->target_auid[i], - axs->target_uid[i], - axs->target_sessionid[i], - axs->target_sid[i], - axs->target_comm[i])) - call_panic = 1; - } - - if (context->target_pid && - audit_log_pid_context(context, context->target_pid, - context->target_auid, context->target_uid, - context->target_sessionid, - context->target_sid, context->target_comm)) - call_panic = 1; - - if (context->pwd.dentry && context->pwd.mnt) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); - if (ab) { - audit_log_d_path(ab, " cwd=", &context->pwd); - audit_log_end(ab); - } - } - - i = 0; - list_for_each_entry(n, &context->names_list, list) - audit_log_name(context, n, i++, &call_panic); - - /* Send end of event record to help user space know we are finished */ - ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); - if (ab) - audit_log_end(ab); - if (call_panic) - audit_panic("error converting sid to string"); -} - -/** - * audit_free - free a per-task audit context - * @tsk: task whose audit context block to free - * - * Called from copy_process and do_exit - */ -void __audit_free(struct task_struct *tsk) -{ - struct audit_context *context; - - context = audit_get_context(tsk, 0, 0); - if (!context) - 
return; - - /* Check for system calls that do not go through the exit - * function (e.g., exit_group), then free context block. - * We use GFP_ATOMIC here because we might be doing this - * in the context of the idle thread */ - /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, tsk); - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - - audit_free_context(context); -} - -/** - * audit_syscall_entry - fill in an audit record at syscall entry - * @arch: architecture type - * @major: major syscall type (function) - * @a1: additional syscall register 1 - * @a2: additional syscall register 2 - * @a3: additional syscall register 3 - * @a4: additional syscall register 4 - * - * Fill in audit context at syscall entry. This only happens if the - * audit context was created when the task was created and the state or - * filters demand the audit context be built. If the state from the - * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, - * then the record will be written at syscall exit time (otherwise, it - * will only be written if another part of the kernel requests that it - * be written). - */ -void __audit_syscall_entry(int arch, int major, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4) -{ - struct task_struct *tsk = current; - struct audit_context *context = tsk->audit_context; - enum audit_state state; - - if (!context) - return; - - /* - * This happens only on certain architectures that make system - * calls in kernel_thread via the entry.S interface, instead of - * with direct calls. (If you are porting to a new - * architecture, hitting this condition can indicate that you - * got the _exit/_leave calls backward in entry.S.) 
- * - * i386 no - * x86_64 no - * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) - * - * This also happens with vm86 emulation in a non-nested manner - * (entries without exits), so this case must be caught. - */ - if (context->in_syscall) { - struct audit_context *newctx; - -#if AUDIT_DEBUG - printk(KERN_ERR - "audit(:%d) pid=%d in syscall=%d;" - " entering syscall=%d\n", - context->serial, tsk->pid, context->major, major); -#endif - newctx = audit_alloc_context(context->state); - if (newctx) { - newctx->previous = context; - context = newctx; - tsk->audit_context = newctx; - } else { - /* If we can't alloc a new context, the best we - * can do is to leak memory (any pending putname - * will be lost). The only other alternative is - * to abandon auditing. */ - audit_zero_context(context, context->state); - } - } - BUG_ON(context->in_syscall || context->name_count); - - if (!audit_enabled) - return; - - context->arch = arch; - context->major = major; - context->argv[0] = a1; - context->argv[1] = a2; - context->argv[2] = a3; - context->argv[3] = a4; - - state = context->state; - context->dummy = !audit_n_rules; - if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { - context->prio = 0; - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); - } - if (state == AUDIT_DISABLED) - return; - - context->serial = 0; - context->ctime = CURRENT_TIME; - context->in_syscall = 1; - context->current_state = state; - context->ppid = 0; -} - -/** - * audit_syscall_exit - deallocate audit context after a system call - * @success: success value of the syscall - * @return_code: return value of the syscall - * - * Tear down after system call. If the audit context has been marked as - * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel wrote an audit - * message), then write out the syscall information. In call cases, - * free the names stored from getname(). 
- */ -void __audit_syscall_exit(int success, long return_code) -{ - struct task_struct *tsk = current; - struct audit_context *context; - - if (success) - success = AUDITSC_SUCCESS; - else - success = AUDITSC_FAILURE; - - context = audit_get_context(tsk, success, return_code); - if (!context) - return; - - if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, tsk); - - context->in_syscall = 0; - context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; - - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - - if (context->previous) { - struct audit_context *new_context = context->previous; - context->previous = NULL; - audit_free_context(context); - tsk->audit_context = new_context; - } else { - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - audit_free_aux(context); - context->aux = NULL; - context->aux_pids = NULL; - context->target_pid = 0; - context->target_sid = 0; - context->sockaddr_len = 0; - context->type = 0; - context->fds[0] = -1; - if (context->state != AUDIT_RECORD_CONTEXT) { - kfree(context->filterkey); - context->filterkey = NULL; - } - tsk->audit_context = context; - } -} - -static inline void handle_one(const struct inode *inode) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_context *context; - struct audit_tree_refs *p; - struct audit_chunk *chunk; - int count; - if (likely(hlist_empty(&inode->i_fsnotify_marks))) - return; - context = current->audit_context; - p = context->trees; - count = context->tree_count; - rcu_read_lock(); - chunk = audit_tree_lookup(inode); - rcu_read_unlock(); - if (!chunk) - return; - if (likely(put_tree_ref(context, chunk))) - return; - if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); - audit_set_auditable(context); - audit_put_chunk(chunk); - unroll_tree_refs(context, p, count); - return; - } - put_tree_ref(context, chunk); -#endif -} - -static 
void handle_path(const struct dentry *dentry) -{ -#ifdef CONFIG_AUDIT_TREE - struct audit_context *context; - struct audit_tree_refs *p; - const struct dentry *d, *parent; - struct audit_chunk *drop; - unsigned long seq; - int count; - - context = current->audit_context; - p = context->trees; - count = context->tree_count; -retry: - drop = NULL; - d = dentry; - rcu_read_lock(); - seq = read_seqbegin(&rename_lock); - for(;;) { - struct inode *inode = d->d_inode; - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { - struct audit_chunk *chunk; - chunk = audit_tree_lookup(inode); - if (chunk) { - if (unlikely(!put_tree_ref(context, chunk))) { - drop = chunk; - break; - } - } - } - parent = d->d_parent; - if (parent == d) - break; - d = parent; - } - if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */ - rcu_read_unlock(); - if (!drop) { - /* just a race with rename */ - unroll_tree_refs(context, p, count); - goto retry; - } - audit_put_chunk(drop); - if (grow_tree_refs(context)) { - /* OK, got more space */ - unroll_tree_refs(context, p, count); - goto retry; - } - /* too bad */ - printk(KERN_WARNING - "out of memory, audit has lost a tree reference\n"); - unroll_tree_refs(context, p, count); - audit_set_auditable(context); - return; - } - rcu_read_unlock(); -#endif -} - -static struct audit_names *audit_alloc_name(struct audit_context *context) -{ - struct audit_names *aname; - - if (context->name_count < AUDIT_NAMES) { - aname = &context->preallocated_names[context->name_count]; - memset(aname, 0, sizeof(*aname)); - } else { - aname = kzalloc(sizeof(*aname), GFP_NOFS); - if (!aname) - return NULL; - aname->should_free = true; - } - - aname->ino = (unsigned long)-1; - list_add_tail(&aname->list, &context->names_list); - - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return aname; -} - -/** - * audit_getname - add a name to the list - * @name: name to add - * - * Add a name to the list of audit names 
for this context. - * Called from fs/namei.c:getname(). - */ -void __audit_getname(const char *name) -{ - struct audit_context *context = current->audit_context; - struct audit_names *n; - - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", - __FILE__, __LINE__, context->serial, name); - dump_stack(); -#endif - return; - } - - n = audit_alloc_name(context); - if (!n) - return; - - n->name = name; - n->name_len = AUDIT_NAME_FULL; - n->name_put = true; - - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); -} - -/* audit_putname - intercept a putname request - * @name: name to intercept and delay for putname - * - * If we have stored the name from getname in the audit context, - * then we delay the putname until syscall exit. - * Called from include/linux/fs.h:putname(). - */ -void audit_putname(const char *name) -{ - struct audit_context *context = current->audit_context; - - BUG_ON(!context); - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", - __FILE__, __LINE__, context->serial, name); - if (context->name_count) { - struct audit_names *n; - int i; - - list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); - } -#endif - __putname(name); - } -#if AUDIT_DEBUG - else { - ++context->put_count; - if (context->put_count > context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d" - " in_syscall=%d putname(%p) name_count=%d" - " put_count=%d\n", - __FILE__, __LINE__, - context->serial, context->major, - context->in_syscall, name, context->name_count, - context->put_count); - dump_stack(); - } - } -#endif -} - -static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = 
caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - - return 0; -} - - -/* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - -/** - * audit_inode - store the inode and device from a lookup - * @name: name being audited - * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). - */ -void __audit_inode(const char *name, const struct dentry *dentry) -{ - struct audit_context *context = current->audit_context; - const struct inode *inode = dentry->d_inode; - struct audit_names *n; - - if (!context->in_syscall) - return; - - list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) - goto out; - } - - /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); - if (!n) - return; -out: - handle_path(dentry); - audit_copy_inode(n, dentry, inode); -} - -/** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited - * @parent: inode of dentry parent - * - * For syscalls that create or remove filesystem objects, audit_inode - * can only collect information for the filesystem object's parent. - * This call updates the audit context with the child's information. - * Syscalls that create a new filesystem object must be hooked after - * the object is created. Syscalls that remove a filesystem object - * must be hooked prior, in order to capture the target inode during - * unsuccessful attempts. 
- */ -void __audit_inode_child(const struct dentry *dentry, - const struct inode *parent) -{ - struct audit_context *context = current->audit_context; - const char *found_parent = NULL, *found_child = NULL; - const struct inode *inode = dentry->d_inode; - const char *dname = dentry->d_name.name; - struct audit_names *n; - int dirlen = 0; - - if (!context->in_syscall) - return; - - if (inode) - handle_one(inode); - - /* parent is more likely, look for it first */ - list_for_each_entry(n, &context->names_list, list) { - if (!n->name) - continue; - - if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, &dirlen)) { - n->name_len = dirlen; /* update parent data in place */ - found_parent = n->name; - goto add_names; - } - } - - /* no matching parent, look for matching child */ - list_for_each_entry(n, &context->names_list, list) { - if (!n->name) - continue; - - /* strcmp() is the more likely scenario */ - if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, &dirlen)) { - if (inode) - audit_copy_inode(n, NULL, inode); - else - n->ino = (unsigned long)-1; - found_child = n->name; - goto add_names; - } - } - -add_names: - if (!found_parent) { - n = audit_alloc_name(context); - if (!n) - return; - audit_copy_inode(n, NULL, parent); - } - - if (!found_child) { - n = audit_alloc_name(context); - if (!n) - return; - - /* Re-use the name belonging to the slot for a matching parent - * directory. 
All names for this context are relinquished in - * audit_free_names() */ - if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; - /* don't call __putname() */ - n->name_put = false; - } - - if (inode) - audit_copy_inode(n, NULL, inode); - } -} -EXPORT_SYMBOL_GPL(__audit_inode_child); - -/** - * auditsc_get_stamp - get local copies of audit_context values - * @ctx: audit_context for the task - * @t: timespec to store time recorded in the audit_context - * @serial: serial value that is recorded in the audit_context - * - * Also sets the context as auditable. - */ -int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) -{ - if (!ctx->in_syscall) - return 0; - if (!ctx->serial) - ctx->serial = audit_serial(); - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; - if (!ctx->prio) { - ctx->prio = 1; - ctx->current_state = AUDIT_RECORD_CONTEXT; - } - return 1; -} - -/* global counter which is incremented every time something logs in */ -static atomic_t session_id = ATOMIC_INIT(0); - -/** - * audit_set_loginuid - set current task's audit_context loginuid - * @loginuid: loginuid value - * - * Returns 0. - * - * Called (set) from fs/proc/base.c::proc_loginuid_write(). 
- */ -int audit_set_loginuid(uid_t loginuid) -{ - struct task_struct *task = current; - struct audit_context *context = task->audit_context; - unsigned int sessionid; - -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (task->loginuid != -1) - return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - - sessionid = atomic_inc_return(&session_id); - if (context && context->in_syscall) { - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u" - " old ses=%u new ses=%u", - task->pid, task_uid(task), - task->loginuid, loginuid, - task->sessionid, sessionid); - audit_log_end(ab); - } - } - task->sessionid = sessionid; - task->loginuid = loginuid; - return 0; -} - -/** - * __audit_mq_open - record audit data for a POSIX MQ open - * @oflag: open flag - * @mode: mode bits - * @attr: queue attributes - * - */ -void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) -{ - struct audit_context *context = current->audit_context; - - if (attr) - memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); - else - memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); - - context->mq_open.oflag = oflag; - context->mq_open.mode = mode; - - context->type = AUDIT_MQ_OPEN; -} - -/** - * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive - * @mqdes: MQ descriptor - * @msg_len: Message length - * @msg_prio: Message priority - * @abs_timeout: Message timeout in absolute time - * - */ -void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec *abs_timeout) -{ - struct audit_context *context = current->audit_context; - struct timespec *p = &context->mq_sendrecv.abs_timeout; - - if (abs_timeout) - memcpy(p, abs_timeout, sizeof(struct timespec)); - else - memset(p, 0, sizeof(struct timespec)); - - 
context->mq_sendrecv.mqdes = mqdes; - context->mq_sendrecv.msg_len = msg_len; - context->mq_sendrecv.msg_prio = msg_prio; - - context->type = AUDIT_MQ_SENDRECV; -} - -/** - * __audit_mq_notify - record audit data for a POSIX MQ notify - * @mqdes: MQ descriptor - * @notification: Notification event - * - */ - -void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) -{ - struct audit_context *context = current->audit_context; - - if (notification) - context->mq_notify.sigev_signo = notification->sigev_signo; - else - context->mq_notify.sigev_signo = 0; - - context->mq_notify.mqdes = mqdes; - context->type = AUDIT_MQ_NOTIFY; -} - -/** - * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute - * @mqdes: MQ descriptor - * @mqstat: MQ flags - * - */ -void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) -{ - struct audit_context *context = current->audit_context; - context->mq_getsetattr.mqdes = mqdes; - context->mq_getsetattr.mqstat = *mqstat; - context->type = AUDIT_MQ_GETSETATTR; -} - -/** - * audit_ipc_obj - record audit data for ipc object - * @ipcp: ipc permissions - * - */ -void __audit_ipc_obj(struct kern_ipc_perm *ipcp) -{ - struct audit_context *context = current->audit_context; - context->ipc.uid = ipcp->uid; - context->ipc.gid = ipcp->gid; - context->ipc.mode = ipcp->mode; - context->ipc.has_perm = 0; - security_ipc_getsecid(ipcp, &context->ipc.osid); - context->type = AUDIT_IPC; -} - -/** - * audit_ipc_set_perm - record audit data for new ipc permissions - * @qbytes: msgq bytes - * @uid: msgq user id - * @gid: msgq group id - * @mode: msgq mode (permissions) - * - * Called only after audit_ipc_obj(). 
- */ -void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) -{ - struct audit_context *context = current->audit_context; - - context->ipc.qbytes = qbytes; - context->ipc.perm_uid = uid; - context->ipc.perm_gid = gid; - context->ipc.perm_mode = mode; - context->ipc.has_perm = 1; -} - -int __audit_bprm(struct linux_binprm *bprm) -{ - struct audit_aux_data_execve *ax; - struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->argc = bprm->argc; - ax->envc = bprm->envc; - ax->mm = bprm->mm; - ax->d.type = AUDIT_EXECVE; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - - -/** - * audit_socketcall - record audit data for sys_socketcall - * @nargs: number of args - * @args: args array - * - */ -void __audit_socketcall(int nargs, unsigned long *args) -{ - struct audit_context *context = current->audit_context; - - context->type = AUDIT_SOCKETCALL; - context->socketcall.nargs = nargs; - memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); -} - -/** - * __audit_fd_pair - record audit data for pipe and socketpair - * @fd1: the first file descriptor - * @fd2: the second file descriptor - * - */ -void __audit_fd_pair(int fd1, int fd2) -{ - struct audit_context *context = current->audit_context; - context->fds[0] = fd1; - context->fds[1] = fd2; -} - -/** - * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto - * @len: data length in user space - * @a: data address in kernel space - * - * Returns 0 for success or NULL context or < 0 on error. 
- */ -int __audit_sockaddr(int len, void *a) -{ - struct audit_context *context = current->audit_context; - - if (!context->sockaddr) { - void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); - if (!p) - return -ENOMEM; - context->sockaddr = p; - } - - context->sockaddr_len = len; - memcpy(context->sockaddr, a, len); - return 0; -} - -void __audit_ptrace(struct task_struct *t) -{ - struct audit_context *context = current->audit_context; - - context->target_pid = t->pid; - context->target_auid = audit_get_loginuid(t); - context->target_uid = task_uid(t); - context->target_sessionid = audit_get_sessionid(t); - security_task_getsecid(t, &context->target_sid); - memcpy(context->target_comm, t->comm, TASK_COMM_LEN); -} - -/** - * audit_signal_info - record signal info for shutting down audit subsystem - * @sig: signal value - * @t: task being signaled - * - * If the audit subsystem is being terminated, record the task (pid) - * and uid that is doing that. - */ -int __audit_signal_info(int sig, struct task_struct *t) -{ - struct audit_aux_data_pids *axp; - struct task_struct *tsk = current; - struct audit_context *ctx = tsk->audit_context; - uid_t uid = current_uid(), t_uid = task_uid(t); - - if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = tsk->pid; - if (tsk->loginuid != -1) - audit_sig_uid = tsk->loginuid; - else - audit_sig_uid = uid; - security_task_getsecid(tsk, &audit_sig_sid); - } - if (!audit_signals || audit_dummy_context()) - return 0; - } - - /* optimize the common case by putting first signal recipient directly - * in audit_context */ - if (!ctx->target_pid) { - ctx->target_pid = t->tgid; - ctx->target_auid = audit_get_loginuid(t); - ctx->target_uid = t_uid; - ctx->target_sessionid = audit_get_sessionid(t); - security_task_getsecid(t, &ctx->target_sid); - memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); - return 0; - } - - axp = (void *)ctx->aux_pids; - if (!axp 
|| axp->pid_count == AUDIT_AUX_PIDS) { - axp = kzalloc(sizeof(*axp), GFP_ATOMIC); - if (!axp) - return -ENOMEM; - - axp->d.type = AUDIT_OBJ_PID; - axp->d.next = ctx->aux_pids; - ctx->aux_pids = (void *)axp; - } - BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); - - axp->target_pid[axp->pid_count] = t->tgid; - axp->target_auid[axp->pid_count] = audit_get_loginuid(t); - axp->target_uid[axp->pid_count] = t_uid; - axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - security_task_getsecid(t, &axp->target_sid[axp->pid_count]); - memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); - axp->pid_count++; - - return 0; -} - -/** - * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps - * @bprm: pointer to the bprm being processed - * @new: the proposed new credentials - * @old: the old credentials - * - * Simply check if the proc already has the caps given by the file and if not - * store the priv escalation info for later auditing at the end of the syscall - * - * -Eric - */ -int __audit_log_bprm_fcaps(struct linux_binprm *bprm, - const struct cred *new, const struct cred *old) -{ - struct audit_aux_data_bprm_fcaps *ax; - struct audit_context *context = current->audit_context; - struct cpu_vfs_cap_data vcaps; - struct dentry *dentry; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->d.type = AUDIT_BPRM_FCAPS; - ax->d.next = context->aux; - context->aux = (void *)ax; - - dentry = dget(bprm->file->f_dentry); - get_vfs_caps_from_disk(dentry, &vcaps); - dput(dentry); - - ax->fcap.permitted = vcaps.permitted; - ax->fcap.inheritable = vcaps.inheritable; - ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - - ax->old_pcap.permitted = old->cap_permitted; - ax->old_pcap.inheritable = old->cap_inheritable; - ax->old_pcap.effective = old->cap_effective; - - ax->new_pcap.permitted = new->cap_permitted; - 
ax->new_pcap.inheritable = new->cap_inheritable; - ax->new_pcap.effective = new->cap_effective; - return 0; -} - -/** - * __audit_log_capset - store information about the arguments to the capset syscall - * @pid: target pid of the capset call - * @new: the new credentials - * @old: the old (current) credentials - * - * Record the aguments userspace sent to sys_capset for later printing by the - * audit system if applicable - */ -void __audit_log_capset(pid_t pid, - const struct cred *new, const struct cred *old) -{ - struct audit_context *context = current->audit_context; - context->capset.pid = pid; - context->capset.cap.effective = new->cap_effective; - context->capset.cap.inheritable = new->cap_effective; - context->capset.cap.permitted = new->cap_permitted; - context->type = AUDIT_CAPSET; -} - -void __audit_mmap_fd(int fd, int flags) -{ - struct audit_context *context = current->audit_context; - context->mmap.fd = fd; - context->mmap.flags = flags; - context->type = AUDIT_MMAP; -} - -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) -{ - uid_t auid, uid; - gid_t gid; - unsigned int sessionid; - - auid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - current_uid_gid(&uid, &gid); - - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " reason="); - audit_log_string(ab, reason); - audit_log_format(ab, " sig=%ld", signr); -} -/** - * audit_core_dumps - record information about processes that end abnormally - * @signr: signal value - * - * If a process ends with a core dump, something fishy is going on and we - * should record the event for investigation. 
- */ -void audit_core_dumps(long signr) -{ - struct audit_buffer *ab; - - if (!audit_enabled) - return; - - if (signr == SIGQUIT) /* don't care for those */ - return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "memory violation", signr); - audit_log_end(ab); -} - -void __audit_seccomp(unsigned long syscall) -{ - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", SIGKILL); - audit_log_format(ab, " syscall=%ld", syscall); - audit_log_end(ab); -} - -struct list_head *audit_killed_trees(void) -{ - struct audit_context *ctx = current->audit_context; - if (likely(!ctx || !ctx->in_syscall)) - return NULL; - return &ctx->killed_trees; -} -/* - * Simple stack backtrace regression test module - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ - -#include -#include -#include -#include -#include -#include - -static void backtrace_test_normal(void) -{ - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - - dump_stack(); -} - -static DECLARE_COMPLETION(backtrace_work); - -static void backtrace_test_irq_callback(unsigned long data) -{ - dump_stack(); - complete(&backtrace_work); -} - -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); - -static void backtrace_test_irq(void) -{ - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - - init_completion(&backtrace_work); - tasklet_schedule(&backtrace_tasklet); - wait_for_completion(&backtrace_work); -} - -#ifdef CONFIG_STACKTRACE -static void backtrace_test_saved(void) -{ - struct stack_trace trace; - unsigned long entries[8]; - - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); - - trace.nr_entries = 0; - trace.max_entries = ARRAY_SIZE(entries); - trace.entries = entries; - trace.skip = 0; - - save_stack_trace(&trace); - print_stack_trace(&trace, 0); -} -#else -static void backtrace_test_saved(void) -{ - printk("Saved backtrace test skipped.\n"); -} -#endif - -static int backtrace_regression_test(void) -{ - printk("====[ backtrace testing ]===========\n"); - - backtrace_test_normal(); - backtrace_test_irq(); - backtrace_test_saved(); - - printk("====[ end of backtrace testing ]====\n"); - return 0; -} - -static void exitf(void) -{ -} - -module_init(backtrace_regression_test); -module_exit(exitf); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arjan van de Ven "); -/* - * Generate definitions needed by the preprocessor. - * This code generates raw asm output which is post-processed - * to extract and format the required data. 
- */ - -#define __GENERATING_BOUNDS_H -/* Include headers that define the enum constants of interest */ -#include -#include -#include -#include - -void foo(void) -{ - /* The enum constants to put into include/generated/bounds.h */ - DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); - DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); - /* End of constants */ -} -/* - * linux/kernel/capability.c - * - * Copyright (C) 1997 Andrew Main - * - * Integrated into 2.1.97+, Andrew G. Morgan - * 30 May 2002: Cleanup, Robert M. Love - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Leveraged for setting/resetting capabilities - */ - -const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; - -EXPORT_SYMBOL(__cap_empty_set); - -int file_caps_enabled = 1; - -static int __init file_caps_disable(char *str) -{ - file_caps_enabled = 0; - return 1; -} -__setup("no_file_caps", file_caps_disable); - -/* - * More recent versions of libcap are available from: - * - * http://www.kernel.org/pub/linux/libs/security/linux-privs/ - */ - -static void warn_legacy_capability_use(void) -{ - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } -} - -/* - * Version 2 capabilities worked fine, but the linux/capability.h file - * that accompanied their introduction encouraged their use without - * the necessary user-space source code changes. As such, we have - * created a version 3 with equivalent functionality to version 2, but - * with a header change to protect legacy source code from using - * version 2 when it wanted to use version 1. If your system has code - * that trips the following warning, it is using version 2 specific - * capabilities and may be doing so insecurely. 
- * - * The remedy is to either upgrade your version of libcap (to 2.10+, - * if the application is linked against it), or recompile your - * application with modern kernel headers and this warning will go - * away. - */ - -static void warn_deprecated_v2(void) -{ - static int warned; - - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } -} - -/* - * Version check. Return the number of u32s in each capability flag - * array, or a negative value on error. - */ -static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) -{ - __u32 version; - - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - *tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. - */ - case _LINUX_CAPABILITY_VERSION_3: - *tocopy = _LINUX_CAPABILITY_U32S_3; - break; - default: - if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } - - return 0; -} - -/* - * The only thing that can change the capabilities of the current - * process is the current process. As such, we can't be in this code - * at the same time as we are in the process of setting capabilities - * in this process. The net result is that we can limit our use of - * locks to when we are reading the caps of another process. 
- */ -static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, - kernel_cap_t *pIp, kernel_cap_t *pPp) -{ - int ret; - - if (pid && (pid != task_pid_vnr(current))) { - struct task_struct *target; - - rcu_read_lock(); - - target = find_task_by_vpid(pid); - if (!target) - ret = -ESRCH; - else - ret = security_capget(target, pEp, pIp, pPp); - - rcu_read_unlock(); - } else - ret = security_capget(current, pEp, pIp, pPp); - - return ret; -} - -/** - * sys_capget - get the capabilities of a given process. - * @header: pointer to struct that contains capability version and - * target pid data - * @dataptr: pointer to struct that contains the effective, permitted, - * and inheritable capabilities that are returned - * - * Returns 0 on success and < 0 on error. - */ -SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) -{ - int ret = 0; - pid_t pid; - unsigned tocopy; - kernel_cap_t pE, pI, pP; - - ret = cap_validate_magic(header, &tocopy); - if ((dataptr == NULL) || (ret != 0)) - return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; - - if (get_user(pid, &header->pid)) - return -EFAULT; - - if (pid < 0) - return -EINVAL; - - ret = cap_get_target_pid(pid, &pE, &pI, &pP); - if (!ret) { - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i; - - for (i = 0; i < tocopy; i++) { - kdata[i].effective = pE.cap[i]; - kdata[i].permitted = pP.cap[i]; - kdata[i].inheritable = pI.cap[i]; - } - - /* - * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, - * we silently drop the upper capabilities here. This - * has the effect of making older libcap - * implementations implicitly drop upper capability - * bits when they perform a: capget/modify/capset - * sequence. - * - * This behavior is considered fail-safe - * behavior. Upgrading the application to a newer - * version of libcap will enable access to the newer - * capabilities. 
- * - * An alternative would be to return an error here - * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts - * before modification is attempted and the application - * fails. - */ - if (copy_to_user(dataptr, kdata, tocopy - * sizeof(struct __user_cap_data_struct))) { - return -EFAULT; - } - } - - return ret; -} - -/** - * sys_capset - set capabilities for a process or (*) a group of processes - * @header: pointer to struct that contains capability version and - * target pid data - * @data: pointer to struct that contains the effective, permitted, - * and inheritable capabilities - * - * Set capabilities for the current process only. The ability to any other - * process(es) has been deprecated and removed. - * - * The restrictions on setting capabilities are specified as: - * - * I: any raised capabilities must be a subset of the old permitted - * P: any raised capabilities must be a subset of the old permitted - * E: must be set to a subset of new permitted - * - * Returns 0 on success and < 0 on error. 
- */ -SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) -{ - struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy, copybytes; - kernel_cap_t inheritable, permitted, effective; - struct cred *new; - int ret; - pid_t pid; - - ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; - - if (get_user(pid, &header->pid)) - return -EFAULT; - - /* may only affect current now */ - if (pid != 0 && pid != task_pid_vnr(current)) - return -EPERM; - - copybytes = tocopy * sizeof(struct __user_cap_data_struct); - if (copybytes > sizeof(kdata)) - return -EFAULT; - - if (copy_from_user(&kdata, data, copybytes)) - return -EFAULT; - - for (i = 0; i < tocopy; i++) { - effective.cap[i] = kdata[i].effective; - permitted.cap[i] = kdata[i].permitted; - inheritable.cap[i] = kdata[i].inheritable; - } - while (i < _KERNEL_CAPABILITY_U32S) { - effective.cap[i] = 0; - permitted.cap[i] = 0; - inheritable.cap[i] = 0; - i++; - } - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = security_capset(new, current_cred(), - &effective, &inheritable, &permitted); - if (ret < 0) - goto error; - - audit_log_capset(pid, new, current_cred()); - - return commit_creds(new); - -error: - abort_creds(new); - return ret; -} - -/** - * has_ns_capability - Does a task have a capability in a specific user ns - * @t: The task in question - * @ns: target user namespace - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the specified user namespace, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. 
- */ -bool has_ns_capability(struct task_struct *t, - struct user_namespace *ns, int cap) -{ - int ret; - - rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); - rcu_read_unlock(); - - return (ret == 0); -} - -/** - * has_capability - Does a task have a capability in init_user_ns - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -bool has_capability(struct task_struct *t, int cap) -{ - return has_ns_capability(t, &init_user_ns, cap); -} - -/** - * has_ns_capability_noaudit - Does a task have a capability (unaudited) - * in a specific user ns. - * @t: The task in question - * @ns: target user namespace - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the specified user namespace, false if not. - * Do not write an audit message for the check. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -bool has_ns_capability_noaudit(struct task_struct *t, - struct user_namespace *ns, int cap) -{ - int ret; - - rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), ns, cap); - rcu_read_unlock(); - - return (ret == 0); -} - -/** - * has_capability_noaudit - Does a task have a capability (unaudited) in the - * initial user ns - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to init_user_ns, false if not. Don't write an - * audit message for the check. - * - * Note that this does not set PF_SUPERPRIV on the task. 
- */ -bool has_capability_noaudit(struct task_struct *t, int cap) -{ - return has_ns_capability_noaudit(t, &init_user_ns, cap); -} - -/** - * ns_capable - Determine if the current task has a superior capability in effect - * @ns: The usernamespace we want the capability in - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool ns_capable(struct user_namespace *ns, int cap) -{ - if (unlikely(!cap_valid(cap))) { - printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(current_cred(), ns, cap) == 0) { - current->flags |= PF_SUPERPRIV; - return true; - } - return false; -} -EXPORT_SYMBOL(ns_capable); - -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - -/** - * nsown_capable - Check superior capability to one's own user_ns - * @cap: The capability in question - * - * Return true if the current task has the given superior capability - * targeted at its own user namespace. - */ -bool nsown_capable(int cap) -{ - return ns_capable(current_user_ns(), cap); -} -/* - * Generic process-grouping system. - * - * Based originally on the cpuset system, extracted by Paul Menage - * Copyright (C) 2006 Google, Inc - * - * Notifications support - * Copyright (C) 2009 Nokia Corporation - * Author: Kirill A. 
Shutemov - * - * Copyright notices from the original cpuset code: - * -------------------------------------------------- - * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004-2006 Silicon Graphics, Inc. - * - * Portions derived from Patrick Mochel's sysfs code. - * sysfs is Copyright (c) 2001-3 Patrick Mochel - * - * 2003-10-10 Written by Simon Derr. - * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson. - * --------------------------------------------------- - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* TODO: replace with more sophisticated array */ -#include -#include -#include /* used in cgroup_attach_proc */ - -#include - -/* - * cgroup_mutex is the master lock. Any modification to cgroup or its - * hierarchy must be performed while holding it. - * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. - * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. - */ -static DEFINE_MUTEX(cgroup_mutex); -static DEFINE_MUTEX(cgroup_root_mutex); - -/* - * Generate an array of cgroup subsystem pointers. 
At boot time, this is - * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. - */ -#define SUBSYS(_x) &_x ## _subsys, -static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { -#include -}; - -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_bits; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_bits; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - -/* - * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the - * subsystems that are otherwise unattached - it never has more than a - * single cgroup, and all tasks are part of that cgroup. - */ -static struct cgroupfs_root rootnode; - -/* - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. - */ -#define CSS_ID_MAX (65535) -struct css_id { - /* - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. 
- * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. - */ - struct cgroup_subsys_state __rcu *css; - /* - * ID of this css. - */ - unsigned short id; - /* - * Depth in hierarchy which this ID belongs to. - */ - unsigned short depth; - /* - * ID is freed by RCU. (and lookup routine is RCU safe.) - */ - struct rcu_head rcu_head; - /* - * Hierarchy of CSS ID belongs to. - */ - unsigned short stack[0]; /* Array of Length (depth+1) */ -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * Cgroup which the event belongs to. - */ - struct cgroup *cgrp; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - -/* The list of hierarchy roots */ - -static LIST_HEAD(roots); -static int root_count; - -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); - -/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ -#define dummytop (&rootnode.top_cgroup) - -/* This flag indicates whether tasks in the fork and exit paths should - * check for fork/exit handlers to call. This avoids us having to do - * extra work in the fork/exit path if none of the subsystems need to - * be called. 
- */ -static int need_forkexit_callback __read_mostly; - -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ - return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ - return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); - -/* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) -{ - return test_bit(CGRP_REMOVED, &cgrp->flags); -} - -/* bits in struct cgroupfs_root flags field */ -enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ -}; - -static int cgroup_is_releasable(const struct cgroup *cgrp) -{ - const int bits = - (1 << CGRP_RELEASABLE) | - (1 << CGRP_NOTIFY_ON_RELEASE); - return (cgrp->flags & bits) == bits; -} - -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - -static int clone_children(const struct cgroup *cgrp) -{ - return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); -} - -/* - * for_each_subsys() allows you to iterate on each subsystem attached to - * an active hierarchy - */ -#define for_each_subsys(_root, _ss) \ -list_for_each_entry(_ss, &_root->subsys_list, sibling) - -/* for_each_active_root() allows you to iterate across the active hierarchies */ -#define for_each_active_root(_root) \ -list_for_each_entry(_root, &roots, root_list) - -/* the list of cgroups eligible for automatic release. 
Protected by - * release_list_lock */ -static LIST_HEAD(release_list); -static DEFINE_RAW_SPINLOCK(release_list_lock); -static void cgroup_release_agent(struct work_struct *work); -static DECLARE_WORK(release_agent_work, cgroup_release_agent); -static void check_for_release(struct cgroup *cgrp); - -/* Link structure for associating css_set objects with cgroups */ -struct cg_cgroup_link { - /* - * List running through cg_cgroup_links associated with a - * cgroup, anchored on cgroup->css_sets - */ - struct list_head cgrp_link_list; - struct cgroup *cgrp; - /* - * List running through cg_cgroup_links pointing at a - * single css_set object, anchored on css_set->cg_links - */ - struct list_head cg_link_list; - struct css_set *cg; -}; - -/* The default css_set - used by init and its children prior to any - * hierarchies being mounted. It contains a pointer to the root state - * for each subsystem. Also used to anchor the list of css_sets. Not - * reference-counted, to improve performance when child cgroups - * haven't been created. - */ - -static struct css_set init_css_set; -static struct cg_cgroup_link init_css_set_link; - -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ -static DEFINE_RWLOCK(css_set_lock); -static int css_set_count; - -/* - * hash table for cgroup groups. This improves the performance to find - * an existing css_set. This hash doesn't (currently) take into - * account cgroups in empty hierarchies. 
- */ -#define CSS_SET_HASH_BITS 7 -#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; - -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) -{ - int i; - int index; - unsigned long tmp = 0UL; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) - tmp += (unsigned long)css[i]; - tmp = (tmp >> 16) ^ tmp; - - index = hash_long(tmp, CSS_SET_HASH_BITS); - - return &css_set_table[index]; -} - -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ -static int use_task_css_set_links __read_mostly; - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cg->refcount, -1, 1)) - return; - write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cg->refcount)) { - write_unlock(&css_set_lock); - return; - } - - /* This css_set is dead. 
unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - struct cgroup *cgrp = link->cgrp; - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - if (atomic_dec_and_test(&cgrp->count) && - notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - - kfree(link); - } - - write_unlock(&css_set_lock); - kfree_rcu(cg, rcu_head); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cg) -{ - atomic_inc(&cg->refcount); -} - -static inline void put_css_set(struct css_set *cg) -{ - __put_css_set(cg, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cg) -{ - __put_css_set(cg, 1); -} - -/* - * compare_css_sets - helper function for find_existing_css_set(). - * @cg: candidate css_set being tested - * @old_cg: existing css_set for a task - * @new_cgrp: cgroup that's being entered by the task - * @template: desired set of css pointers in css_set (pre-calculated) - * - * Returns true if "cg" matches "old_cg" except for the hierarchy - * which "new_cgrp" belongs to, for which it should match "new_cgrp". - */ -static bool compare_css_sets(struct css_set *cg, - struct css_set *old_cg, - struct cgroup *new_cgrp, - struct cgroup_subsys_state *template[]) -{ - struct list_head *l1, *l2; - - if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* Not all subsystems matched */ - return false; - } - - /* - * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. 
- */ - - l1 = &cg->cg_links; - l2 = &old_cg->cg_links; - while (1) { - struct cg_cgroup_link *cgl1, *cgl2; - struct cgroup *cg1, *cg2; - - l1 = l1->next; - l2 = l2->next; - /* See if we reached the end - both lists are equal length. */ - if (l1 == &cg->cg_links) { - BUG_ON(l2 != &old_cg->cg_links); - break; - } else { - BUG_ON(l2 == &old_cg->cg_links); - } - /* Locate the cgroups associated with these links. */ - cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); - cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); - cg1 = cgl1->cgrp; - cg2 = cgl2->cgrp; - /* Hierarchies should be linked in the same order. */ - BUG_ON(cg1->root != cg2->root); - - /* - * If this hierarchy is the hierarchy of the cgroup - * that's changing, then we need to check that this - * css_set points to the new cgroup; if it's any other - * hierarchy, then this css_set should point to the - * same cgroup as the old css_set. - */ - if (cg1->root == new_cgrp->root) { - if (cg1 != new_cgrp) - return false; - } else { - if (cg1 != cg2) - return false; - } - } - return true; -} - -/* - * find_existing_css_set() is a helper for - * find_css_set(), and checks to see whether an existing - * css_set is suitable. - * - * oldcg: the cgroup group that we're using before the cgroup - * transition - * - * cgrp: the cgroup that we're moving into - * - * template: location in which to build the desired set of subsystem - * state objects for the new cgroup group - */ -static struct css_set *find_existing_css_set( - struct css_set *oldcg, - struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) -{ - int i; - struct cgroupfs_root *root = cgrp->root; - struct hlist_head *hhead; - struct hlist_node *node; - struct css_set *cg; - - /* - * Build the set of subsystem state objects that we want to see in the - * new css_set. while subsystems can change globally, the entries here - * won't change, so no need for locking. 
- */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1UL << i)) { - /* Subsystem is in this hierarchy. So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgrp->subsys[i]; - } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ - template[i] = oldcg->subsys[i]; - } - } - - hhead = css_set_hash(template); - hlist_for_each_entry(cg, node, hhead, hlist) { - if (!compare_css_sets(cg, oldcg, cgrp, template)) - continue; - - /* This css_set matches what we need */ - return cg; - } - - /* No existing cgroup group matched */ - return NULL; -} - -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -/* - * allocate_cg_links() allocates "count" cg_cgroup_link structures - * and chains them on tmp through their cgrp_link_list fields. 
Returns 0 on - * success or a negative error - */ -static int allocate_cg_links(int count, struct list_head *tmp) -{ - struct cg_cgroup_link *link; - int i; - INIT_LIST_HEAD(tmp); - for (i = 0; i < count; i++) { - link = kmalloc(sizeof(*link), GFP_KERNEL); - if (!link) { - free_cg_links(tmp); - return -ENOMEM; - } - list_add(&link->cgrp_link_list, tmp); - } - return 0; -} - -/** - * link_css_set - a helper function to link a css_set to a cgroup - * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() - * @cg: the css_set to be linked - * @cgrp: the destination cgroup - */ -static void link_css_set(struct list_head *tmp_cg_links, - struct css_set *cg, struct cgroup *cgrp) -{ - struct cg_cgroup_link *link; - - BUG_ON(list_empty(tmp_cg_links)); - link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, - cgrp_link_list); - link->cg = cg; - link->cgrp = cgrp; - atomic_inc(&cgrp->count); - list_move(&link->cgrp_link_list, &cgrp->css_sets); - /* - * Always add links to the tail of the list so that the list - * is sorted by order of hierarchy creation - */ - list_add_tail(&link->cg_link_list, &cg->cg_links); -} - -/* - * find_css_set() takes an existing cgroup group and a - * cgroup object, and returns a css_set object that's - * equivalent to the old group, but with the given cgroup - * substituted into the appropriate hierarchy. 
Must be called with - * cgroup_mutex held - */ -static struct css_set *find_css_set( - struct css_set *oldcg, struct cgroup *cgrp) -{ - struct css_set *res; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - struct list_head tmp_cg_links; - - struct hlist_head *hhead; - struct cg_cgroup_link *link; - - /* First see if we already have a cgroup group that matches - * the desired set */ - read_lock(&css_set_lock); - res = find_existing_css_set(oldcg, cgrp, template); - if (res) - get_css_set(res); - read_unlock(&css_set_lock); - - if (res) - return res; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return NULL; - - /* Allocate all the cg_cgroup_link objects that we'll need */ - if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { - kfree(res); - return NULL; - } - - atomic_set(&res->refcount, 1); - INIT_LIST_HEAD(&res->cg_links); - INIT_LIST_HEAD(&res->tasks); - INIT_HLIST_NODE(&res->hlist); - - /* Copy the set of subsystem state objects generated in - * find_existing_css_set() */ - memcpy(res->subsys, template, sizeof(res->subsys)); - - write_lock(&css_set_lock); - /* Add reference counts and links from the new css_set. */ - list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { - struct cgroup *c = link->cgrp; - if (c->root == cgrp->root) - c = cgrp; - link_css_set(&tmp_cg_links, res, c); - } - - BUG_ON(!list_empty(&tmp_cg_links)); - - css_set_count++; - - /* Add this cgroup group to the hash table */ - hhead = css_set_hash(res->subsys); - hlist_add_head(&res->hlist, hhead); - - write_unlock(&css_set_lock); - - return res; -} - -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. 
- */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) -{ - struct css_set *css; - struct cgroup *res = NULL; - - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); - /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. - */ - css = task->cgroups; - if (css == &init_css_set) { - res = &root->top_cgroup; - } else { - struct cg_cgroup_link *link; - list_for_each_entry(link, &css->cg_links, cg_link_list) { - struct cgroup *c = link->cgrp; - if (c->root == root) { - res = c; - break; - } - } - } - read_unlock(&css_set_lock); - BUG_ON(!res); - return res; -} - -/* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * - * A task must hold cgroup_mutex to modify cgroups. - * - * Any task can increment and decrement the count field without lock. - * So in general, code holding cgroup_mutex can't rely on the count - * field not changing. However, if the count goes to zero, then only - * cgroup_attach_task() can increment it again. Because a count of zero - * means that no tasks are currently attached, therefore there is no - * way a task attached to that cgroup can fork (the other way to - * increment the count). So code holding cgroup_mutex can safely - * assume that if the count is zero, it will stay zero. Similarly, if - * a task holds cgroup_mutex on a cgroup with zero count, it - * knows that the cgroup won't be removed, as cgroup_rmdir() - * needs that mutex. - * - * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't - * (usually) take cgroup_mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cgroup_exit(), - * when a task in a notify_on_release cgroup exits. 
Then cgroup_mutex - * is taken, and if the cgroup count is zero, a usermode call made - * to the release agent with the name of the cgroup (path relative to - * the root of cgroup file system) as the argument. - * - * A cgroup can only be deleted if both its 'count' of using tasks - * is zero, and its list of 'children' cgroups is empty. Since all - * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup - * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. - * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. - * - * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cgroup pointer by cgroup_attach_task() - */ - -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_lock); - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. 
- */ -void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unlock); - -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. - */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp); -static const struct inode_operations cgroup_dir_inode_operations; -static const struct file_operations proc_cgroupstats_operations; - -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - - if (inode) { - inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; -} - -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); - if (ret) - break; - } - - return ret; -} - -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? 
if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - struct cgroup_subsys *ss; - BUG_ON(!(cgroup_is_removed(cgrp))); - /* It's possible for external users to be holding css - * reference counts on a cgroup; css_put() needs to - * be able to access the cgroup after decrementing - * the reference count in order to know if it needs to - * queue the cgroup to be handled by the release - * agent */ - synchronize_rcu(); - - mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); - - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* - * Drop the active superblock reference that we took when we - * created the cgroup - */ - deactivate_super(cgrp->root->sb); - - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); - - kfree_rcu(cgrp, rcu_head); - } - iput(inode); -} - -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); - - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); -} - -static void cgroup_clear_directory(struct dentry *dentry) -{ - struct list_head *node; - - BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dentry->d_lock); - node = dentry->d_subdirs.next; - while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_u.d_child); - - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(node); - if (d->d_inode) { - /* This should never be called on a cgroup - * directory with child cgroups */ - BUG_ON(d->d_inode->i_mode & S_IFDIR); - dget_dlock(d); - spin_unlock(&d->d_lock); - spin_unlock(&dentry->d_lock); - d_delete(d); - simple_unlink(dentry->d_inode, d); - dput(d); - spin_lock(&dentry->d_lock); - } else - spin_unlock(&d->d_lock); 
- node = dentry->d_subdirs.next; - } - spin_unlock(&dentry->d_lock); -} - -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - cgroup_clear_directory(dentry); - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - -/* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. 
- */ -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_bits) -{ - unsigned long added_bits, removed_bits; - struct cgroup *cgrp = &root->top_cgroup; - int i; - - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - - removed_bits = root->actual_subsys_bits & ~final_bits; - added_bits = final_bits & ~root->actual_subsys_bits; - /* Check that any added subsystems are currently free */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_bits)) - continue; - /* - * Nobody should tell us to do a subsys that doesn't exist: - * parse_cgroupfs_options should catch that case and refcounts - * ensure that subsystems won't disappear once selected. - */ - BUG_ON(ss == NULL); - if (ss->root != &rootnode) { - /* Subsystem isn't free */ - return -EBUSY; - } - } - - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. 
This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) - return -EBUSY; - - /* Process each subsystem */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - unsigned long bit = 1UL << i; - if (bit & added_bits) { - /* We're binding this subsystem to this hierarchy */ - BUG_ON(ss == NULL); - BUG_ON(cgrp->subsys[i]); - BUG_ON(!dummytop->subsys[i]); - BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - mutex_lock(&ss->hierarchy_mutex); - cgrp->subsys[i] = dummytop->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; - list_move(&ss->sibling, &root->subsys_list); - ss->root = root; - if (ss->bind) - ss->bind(ss, cgrp); - mutex_unlock(&ss->hierarchy_mutex); - /* refcount was already taken, and we're keeping it */ - } else if (bit & removed_bits) { - /* We're removing this subsystem */ - BUG_ON(ss == NULL); - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - mutex_lock(&ss->hierarchy_mutex); - if (ss->bind) - ss->bind(ss, dummytop); - dummytop->subsys[i]->cgroup = dummytop; - cgrp->subsys[i] = NULL; - subsys[i]->root = &rootnode; - list_move(&ss->sibling, &rootnode.subsys_list); - mutex_unlock(&ss->hierarchy_mutex); - /* subsystem is now free - drop reference on module */ - module_put(ss->module); - } else if (bit & final_bits) { - /* Subsystem state should already exist */ - BUG_ON(ss == NULL); - BUG_ON(!cgrp->subsys[i]); - /* - * a refcount was taken, but we already had one, so - * drop the extra reference. 
- */ - module_put(ss->module); -#ifdef CONFIG_MODULE_UNLOAD - BUG_ON(ss->module && !module_refcount(ss->module)); -#endif - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); - } - } - root->subsys_bits = root->actual_subsys_bits = final_bits; - synchronize_rcu(); - - return 0; -} - -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) -{ - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - struct cgroup_subsys *ss; - - mutex_lock(&cgroup_root_mutex); - for_each_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) - seq_puts(seq, ",noprefix"); - if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (clone_children(&root->top_cgroup)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); - return 0; -} - -struct cgroup_sb_opts { - unsigned long subsys_bits; - unsigned long flags; - char *release_agent; - bool clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; - - struct cgroupfs_root *new_root; - -}; - -/* - * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call - * with cgroup_mutex held to protect the subsys[] array. This function takes - * refcounts on subsystems to be used, unless it returns error, in which case - * no refcounts are taken. 
- */ -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; - int i; - bool module_pin_failed = false; - - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - -#ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); - continue; - } - if (!strcmp(token, "clone_children")) { - opts->clone_children = true; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (strcmp(token, ss->name)) - continue; - if (ss->disabled) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if 
(all_ss) - return -EINVAL; - set_bit(i, &opts->subsys_bits); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options - * were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->disabled) - continue; - set_bit(i, &opts->subsys_bits); - } - } - - /* Consistency checks */ - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. - */ - if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_bits & mask)) - return -EINVAL; - - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_bits && opts->none) - return -EINVAL; - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_bits && !opts->name) - return -EINVAL; - - /* - * Grab references on all the modules we'll need, so the subsystems - * don't dance around before rebind_subsystems attaches them. This may - * take duplicate reference counts on a subsystem that's already used, - * but rebind_subsystems handles this case. - */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_bits)) - continue; - if (!try_module_get(subsys[i]->module)) { - module_pin_failed = true; - break; - } - } - if (module_pin_failed) { - /* - * oops, one of the modules was going away. this means that we - * raced with a module_delete call, and to the user this is - * essentially a "subsystem doesn't exist" case. 
- */ - for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { - /* drop refcounts only on the ones we took */ - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_bits)) - continue; - module_put(subsys[i]->module); - } - return -ENOENT; - } - - return 0; -} - -static void drop_parsed_module_refcounts(unsigned long subsys_bits) -{ - int i; - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - - if (!(bit & subsys_bits)) - continue; - module_put(subsys[i]->module); - } -} - -static int cgroup_remount(struct super_block *sb, int *flags, char *data) -{ - int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgroup_sb_opts opts; - - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* Don't allow flags or name to change at remount */ - if (opts.flags != root->flags || - (opts.name && strcmp(opts.name, root->name))) { - ret = -EINVAL; - drop_parsed_module_refcounts(opts.subsys_bits); - goto out_unlock; - } - - ret = rebind_subsystems(root, opts.subsys_bits); - if (ret) { - drop_parsed_module_refcounts(opts.subsys_bits); - goto out_unlock; - } - - /* (re)populate subsystem files */ - cgroup_populate_dir(cgrp); - - if (opts.release_agent) - strcpy(root->release_agent_path, opts.release_agent); - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - return ret; -} - -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; - -static void init_cgroup_housekeeping(struct cgroup *cgrp) -{ - INIT_LIST_HEAD(&cgrp->sibling); - 
INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pidlists); - mutex_init(&cgrp->pidlist_mutex); - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); -} - -static void init_cgroup_root(struct cgroupfs_root *root) -{ - struct cgroup *cgrp = &root->top_cgroup; - INIT_LIST_HEAD(&root->subsys_list); - INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; - cgrp->root = root; - cgrp->top_cgroup = cgrp; - init_cgroup_housekeeping(cgrp); -} - -static bool init_root_id(struct cgroupfs_root *root) -{ - int ret = 0; - - do { - if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return false; - spin_lock(&hierarchy_id_lock); - /* Try to allocate the next unused ID */ - ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, - &root->hierarchy_id); - if (ret == -ENOSPC) - /* Try again starting from 0 */ - ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); - if (!ret) { - next_hierarchy_id = root->hierarchy_id + 1; - } else if (ret != -EAGAIN) { - /* Can only get here if the 31-bit IDR is full ... 
*/ - BUG_ON(ret); - } - spin_unlock(&hierarchy_id_lock); - } while (ret); - return true; -} - -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_bits || opts->none) - && (opts->subsys_bits != root->subsys_bits)) - return 0; - - return 1; -} - -static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) -{ - struct cgroupfs_root *root; - - if (!opts->subsys_bits && !opts->none) - return NULL; - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - return ERR_PTR(-ENOMEM); - - if (!init_root_id(root)) { - kfree(root); - return ERR_PTR(-ENOMEM); - } - init_cgroup_root(root); - - root->subsys_bits = opts->subsys_bits; - root->flags = opts->flags; - if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); - if (opts->name) - strcpy(root->name, opts->name); - if (opts->clone_children) - set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); - return root; -} - -static void cgroup_drop_root(struct cgroupfs_root *root) -{ - if (!root) - return; - - BUG_ON(!root->hierarchy_id); - spin_lock(&hierarchy_id_lock); - ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - kfree(root); -} - -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; - - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; - - BUG_ON(!opts->subsys_bits && !opts->none); - - ret = set_anon_super(sb, NULL); - if (ret) - return ret; - - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - 
sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; - - return 0; -} - -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = cgroup_delete, - }; - - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; - - if (!inode) - return -ENOMEM; - - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); - return -ENOMEM; - } - sb->s_root = dentry; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; -} - -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) -{ - struct cgroup_sb_opts opts; - struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; - struct cgroupfs_root *new_root; - struct inode *inode; - - /* First find the desired set of subsystems */ - mutex_lock(&cgroup_mutex); - ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); - if (ret) - goto out_err; - - /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. 
- */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto drop_modules; - } - opts.new_root = new_root; - - /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_drop_root(opts.new_root); - goto drop_modules; - } - - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_cg_links; - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - const struct cred *cred; - int i; - - BUG_ON(sb->s_root != NULL); - - ret = cgroup_get_rootdir(sb); - if (ret) - goto drop_new_super; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cg_links(css_set_count, &tmp_cg_links); - if (ret) - goto unlock_drop; - - ret = rebind_subsystems(root, root->subsys_bits); - if (ret == -EBUSY) { - free_cg_links(&tmp_cg_links); - goto unlock_drop; - } - /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. 
- */ - - /* EBUSY should be the only error here */ - BUG_ON(ret); - - list_add(&root->root_list, &roots); - root_count++; - - sb->s_root->d_fsdata = root_cgrp; - root->top_cgroup.dentry = sb->s_root; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct hlist_head *hhead = &css_set_table[i]; - struct hlist_node *node; - struct css_set *cg; - - hlist_for_each_entry(cg, node, hhead, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); - } - write_unlock(&css_set_lock); - - free_cg_links(&tmp_cg_links); - - BUG_ON(!list_empty(&root_cgrp->sibling)); - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); - - cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp); - revert_creds(cred); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - } else { - /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed - */ - cgroup_drop_root(opts.new_root); - /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_bits); - } - - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); - - unlock_drop: - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - drop_modules: - drop_parsed_module_refcounts(opts.subsys_bits); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ERR_PTR(ret); -} - -static void cgroup_kill_sb(struct super_block *sb) { - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - int ret; - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(!list_empty(&cgrp->sibling)); - - mutex_lock(&cgroup_mutex); - 
mutex_lock(&cgroup_root_mutex); - - /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - - /* - * Release all the links from css_sets to this hierarchy's - * root cgroup - */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, - cgrp_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } - write_unlock(&css_set_lock); - - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - - kill_litter_super(sb); - cgroup_drop_root(root); -} - -static struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, -}; - -static struct kobject *cgroup_kobj; - -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -/** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. 
- */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - char *start; - struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); - - if (!dentry || cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); - return 0; - } - - start = buf + buflen; - - *--start = '\0'; - for (;;) { - int len = dentry->d_name.len; - - if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; - - dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); - if (!cgrp->parent) - continue; - if (--start < buf) - return -ENAMETOOLONG; - *start = '/'; - } - memmove(buf, start, buf + buflen - start); - return 0; -} -EXPORT_SYMBOL_GPL(cgroup_path); - -/* - * Control Group taskset - */ -struct task_and_cgroup { - struct task_struct *task; - struct cgroup *cgrp; -}; - -struct cgroup_taskset { - struct task_and_cgroup single; - struct flex_array *tc_array; - int tc_array_len; - int idx; - struct cgroup *cur_cgrp; -}; - -/** - * cgroup_taskset_first - reset taskset and return the first task - * @tset: taskset of interest - * - * @tset iteration is initialized and the first task is returned. - */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) -{ - if (tset->tc_array) { - tset->idx = 0; - return cgroup_taskset_next(tset); - } else { - tset->cur_cgrp = tset->single.cgrp; - return tset->single.task; - } -} -EXPORT_SYMBOL_GPL(cgroup_taskset_first); - -/** - * cgroup_taskset_next - iterate to the next task in taskset - * @tset: taskset of interest - * - * Return the next task in @tset. Iteration must have been initialized - * with cgroup_taskset_first(). 
- */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) -{ - struct task_and_cgroup *tc; - - if (!tset->tc_array || tset->idx >= tset->tc_array_len) - return NULL; - - tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; - return tc->task; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_next); - -/** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task - * @tset: taskset of interest - * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). - */ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) -{ - return tset->cur_cgrp; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); - -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? tset->tc_array_len : 1; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - - -/* - * cgroup_task_migrate - move a task from one cgroup to another. - * - * 'guarantee' is set if the caller promises that a new css_set for the task - * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. - */ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) -{ - struct css_set *oldcg; - struct css_set *newcg; - - /* - * We are synchronized through threadgroup_lock() against PF_EXITING - * setting such that we can't race against cgroup_exit() changing the - * css_set to init_css_set and dropping the old one. - */ - WARN_ON_ONCE(tsk->flags & PF_EXITING); - oldcg = tsk->cgroups; - - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. 
*/ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced. */ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - - task_lock(tsk); - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); - write_unlock(&css_set_lock); - - /* - * We just gained a reference on oldcg by taking it from the task. As - * trading it for newcg is protected by cgroup_mutex, we're safe to drop - * it here; it will be freed under RCU. - */ - put_css_set(oldcg); - - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; -} - -/** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. 
- */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. (See below) - */ - failed_ss = ss; - goto out; - } - } - } - - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) - goto out; - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(ss, cgrp, &tset); - } - - synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. 
- */ - break; - if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); - } - } - return retval; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroupfs_root *root; - int retval = 0; - - cgroup_lock(); - for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); - - retval = cgroup_attach_task(from_cg, tsk); - if (retval) - break; - } - cgroup_unlock(); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. 
- */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - -/** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup - * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached - * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. - */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) -{ - int retval, i, group_size; - struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; - struct cgroupfs_root *root = cgrp->root; - /* threadgroup list cursor and array */ - struct task_struct *tsk; - struct task_and_cgroup *tc; - struct flex_array *group; - struct cgroup_taskset tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; - - /* - * step 0: in order to do expensive, possibly blocking operations for - * every thread, we cannot iterate the thread group list, since it needs - * rcu or tasklist locked. instead, build an array of all threads in the - * group - group_rwsem prevents new threads from appearing, and if - * threads exit, this will just be an over-estimate. 
- */ - group_size = get_nr_threads(leader); - /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); - if (!group) - return -ENOMEM; - /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); - if (retval) - goto out_free_group_list; - - /* prevent changes to the threadgroup list while we take a snapshot. */ - read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". - */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - - tsk = leader; - i = 0; - do { - struct task_and_cgroup ent; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - continue; - - /* as per above, nr_threads may decrease, but not increase. */ - BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); - /* nothing to do if this task is already in the cgroup */ - if (ent.cgrp == cgrp) - continue; - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); - i++; - } while_each_thread(leader, tsk); - /* remember the number of threads in the array for later. 
*/ - group_size = i; - tset.tc_array = group; - tset.tc_array_len = group_size; - read_unlock(&tasklist_lock); - - /* methods shouldn't be called if no task is actually migrating */ - retval = 0; - if (!group_size) - goto out_free_group_list; - - /* - * step 1: check that we can legitimately attach to the cgroup. - */ - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); - if (retval) { - failed_ss = ss; - goto out_cancel_attach; - } - } - } - - /* - * step 2: make sure css_sets exist for all threads to be migrated. - * we use find_css_set, which allocates a new one if necessary. - */ - INIT_LIST_HEAD(&newcg_list); - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; - } - } - - /* - * step 3: now that we're guaranteed success wrt the css_sets, - * proceed to move all tasks to the new cgroup. There are no - * failure cases after here, so this is the commit point. - */ - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); - } - /* nothing is sensitive to fork() after this point. */ - - /* - * step 4: do subsystem attach callbacks. - */ - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(ss, cgrp, &tset); - } - - /* - * step 5: success! and cleanup - */ - synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); - retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. 
*/ - list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { - list_del(&cg_entry->links); - put_css_set(cg_entry->cg); - kfree(cg_entry); - } -out_cancel_attach: - /* same deal as in cgroup_attach_task */ - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - break; - if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); - } - } -out_free_group_list: - flex_array_free(group); - return retval; -} - -/* - * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup; may take task_lock of task. - */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) -{ - struct task_struct *tsk; - const struct cred *cred = current_cred(), *tcred; - int ret; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - if (pid) { - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. - */ - tsk = tsk->group_leader; - } - /* - * even if we're attaching all tasks in the thread group, we - * only need to check permissions on one of them. 
- */ - tcred = __task_cred(tsk); - if (cred->euid && - cred->euid != tcred->uid && - cred->euid != tcred->suid) { - rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; - } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); - - if (threadgroup) - ret = cgroup_attach_proc(cgrp, tsk); - else - ret = cgroup_attach_task(cgrp, tsk); - - threadgroup_unlock(tsk); - - put_task_struct(tsk); - cgroup_unlock(); - return ret; -} - -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) -{ - return attach_task_by_pid(cgrp, pid, false); -} - -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) -{ - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. - */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; -} - -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. 
- */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); - -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) -{ - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); - mutex_unlock(&cgroup_root_mutex); - cgroup_unlock(); - return 0; -} - -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) -{ - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - seq_puts(seq, cgrp->root->release_agent_path); - seq_putc(seq, '\n'); - cgroup_unlock(); - return 0; -} - -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - if (cft->write_u64) { - u64 val = simple_strtoull(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); - } else { - s64 val = simple_strtoll(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); - } - if (!retval) - retval = nbytes; - return retval; -} - -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - char 
local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; - - if (!max_bytes) - max_bytes = sizeof(local_buffer) - 1; - if (nbytes >= max_bytes) - return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; - } - - buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; -} - -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - - if (cgroup_is_removed(cgrp)) - return -ENODEV; - if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); - return ret ? 
ret : nbytes; - } - return -EINVAL; -} - -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} - -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - - if (cgroup_is_removed(cgrp)) - return -ENODEV; - - if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); - if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); - return -EINVAL; -} - -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. 
- */ - -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) -{ - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); -} - -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(state->cgroup, cft, &cb); - } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); -} - -static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = cgroup_seqfile_release, -}; - -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - int err; - struct cftype *cft; - - err = generic_file_open(inode, file); - if (err) - return err; - cft = __d_cft(file->f_dentry); - - if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state = - kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) - err = cft->open(inode, file); - else - err = 0; - - return err; -} - -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cftype *cft = __d_cft(file->f_dentry); - if (cft->release) - return cft->release(inode, file); - return 0; -} - -/* - * cgroup_rename - Only allow simple rename of directories in place. 
- */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) - return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) - return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); -} - -static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = cgroup_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, -}; - -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_add(dentry, NULL); - return NULL; -} - -/* - * Check if a file is a control file - */ -static inline struct cftype *__file_cft(struct file *file) -{ - if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." 
entry) */ - inc_nlink(inode); - - /* start with the directory inode held, so that we can - * populate it without racing with another mkdir */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; -} - -/* - * cgroup_create_dir - create a directory for an object. - * @cgrp: the cgroup we create the directory for. It must have a valid - * ->parent field. And we are going to fill its ->dentry field. - * @dentry: dentry of the new cgroup - * @mode: mode to set on new directory. - */ -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - umode_t mode) -{ - struct dentry *parent; - int error = 0; - - parent = cgrp->parent->dentry; - error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); - if (!error) { - dentry->d_fsdata = cgrp; - inc_nlink(parent->d_inode); - rcu_assign_pointer(cgrp->dentry, dentry); - dget(dentry); - } - dput(dentry); - - return error; -} - -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander - */ -static umode_t cgroup_file_mode(const struct cftype *cft) -{ - umode_t mode = 0; - - if (cft->mode) - return cft->mode; - - if (cft->read || cft->read_u64 || cft->read_s64 || - cft->read_map || cft->read_seq_string) - mode |= S_IRUGO; - - if (cft->write || cft->write_u64 || cft->write_s64 || - cft->write_string || cft->trigger) - mode |= S_IWUSR; - - return mode; -} - -int cgroup_add_file(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype *cft) -{ - struct dentry *dir = cgrp->dentry; - struct dentry *dentry; - int error; - umode_t mode; - - char 
name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { - strcpy(name, subsys->name); - strcat(name, "."); - } - strcat(name, cft->name); - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - dentry = lookup_one_len(name, dir, strlen(name)); - if (!IS_ERR(dentry)) { - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, - cgrp->root->sb); - if (!error) - dentry->d_fsdata = (void *)cft; - dput(dentry); - } else - error = PTR_ERR(dentry); - return error; -} -EXPORT_SYMBOL_GPL(cgroup_add_file); - -int cgroup_add_files(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype cft[], - int count) -{ - int i, err; - for (i = 0; i < count; i++) { - err = cgroup_add_file(cgrp, subsys, &cft[i]); - if (err) - return err; - } - return 0; -} -EXPORT_SYMBOL_GPL(cgroup_add_files); - -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cg_cgroup_link *link; - - read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->refcount); - } - read_unlock(&css_set_lock); - return count; -} - -/* - * Advance a list_head iterator. 
The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) -{ - struct list_head *l = it->cg_link; - struct cg_cgroup_link *link; - struct css_set *cg; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->css_sets) { - it->cg_link = NULL; - return; - } - link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); - cg = link->cg; - } while (list_empty(&cg->tasks)); - it->cg_link = l; - it->task = cg->tasks.next; -} - -/* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - do_each_thread(g, p) { - task_lock(p); - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - */ - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &p->cgroups->tasks); - task_unlock(p); - } while_each_thread(g, p); - write_unlock(&css_set_lock); -} - -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) - __acquires(css_set_lock) -{ - /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. 
- */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - - read_lock(&css_set_lock); - it->cg_link = &cgrp->css_sets; - cgroup_advance_iter(cgrp, it); -} - -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) -{ - struct task_struct *res; - struct list_head *l = it->task; - struct cg_cgroup_link *link; - - /* If the iterator cg is NULL, we have no tasks */ - if (!it->cg_link) - return NULL; - res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ - l = l->next; - link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); - if (l == &link->cg->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); - } else { - it->task = l; - } - return res; -} - -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) - __releases(css_set_lock) -{ - read_unlock(&css_set_lock); -} - -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. 
- */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - -/** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan - * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. - * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. - * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). 
- */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) -{ - int retval, i; - struct cgroup_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct ptr_heap *heap; - struct timespec latest_time = { 0, 0 }; - - if (scan->heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } - - again: - /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. 
- */ - heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (scan->test_task && !scan->test_task(p, scan)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - cgroup_iter_end(scan->cg, &it); - - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - scan->process_task(q, scan); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} - -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 
- * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} -static void pidlist_free(void *p) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); -} -static void *pidlist_resize(void *p, int newcount) -{ - void *newlist; - /* note: if new alloc fails, old p will still be valid either way */ - if (is_vmalloc_addr(p)) { - newlist = vmalloc(newcount * sizeof(pid_t)); - if (!newlist) - return NULL; - memcpy(newlist, p, newcount * sizeof(pid_t)); - vfree(p); - } else { - newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); - } - return newlist; -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * If the new stripped list is sufficiently smaller and there's enough memory - * to allocate a new buffer, will let go of the unneeded memory. Returns the - * number of unique elements. - */ -/* is the size difference enough that we should re-allocate the array? */ -#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) -static int pidlist_uniq(pid_t **p, int length) -{ - int src, dest = 1; - pid_t *list = *p; - pid_t *newlist; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - /* - * if the length difference is large enough, we want to allocate a - * smaller buffer to save memory. 
if this fails due to out of memory, - * we'll just stay with what we've got. - */ - if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { - newlist = pidlist_resize(list, dest); - if (newlist) - *p = newlist; - } - return dest; -} - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. - */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; - - /* - * We can't drop the pidlist_mutex before taking the l->mutex in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). 
- */ - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry(l, &cgrp->pidlists, links) { - if (l->key.type == type && l->key.ns == ns) { - /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - } - /* entry not found; create a new one */ - l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - init_rwsem(&l->mutex); - down_write(&l->mutex); - l->key.type = type; - l->key.ns = get_pid_ns(ns); - l->use_count = 0; /* don't increment here */ - l->list = NULL; - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - mutex_unlock(&cgrp->pidlist_mutex); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. 
- */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - cgroup_iter_end(cgrp, &it); - length = n; - /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(&array, length); - l = cgroup_pidlist_find(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; - } - /* store array, freeing old if necessary - lock already held */ - pidlist_free(l->list); - l->list = array; - l->length = length; - l->use_count++; - up_write(&l->mutex); - *lp = l; - return 0; -} - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - int ret = -EINVAL; - struct cgroup *cgrp; - struct cgroup_iter it; - struct task_struct *tsk; - - /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. 
- */ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; - - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - cgroup_iter_end(cgrp, &it); - -err: - return ret; -} - - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) -{ - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). 
Use a binary-search to find the - * next pid to display, if any - */ - struct cgroup_pidlist *l = s->private; - int index = 0, pid = *pos; - int *iter; - - down_read(&l->mutex); - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (l->list[mid] == pid) { - index = mid; - break; - } else if (l->list[mid] <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = *iter; - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); -} - -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct cgroup_pidlist *l = s->private; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - return NULL; - } else { - *pos = *p; - return p; - } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - return seq_printf(s, "%d\n", *(int *)v); -} - -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - -static void cgroup_release_pid_array(struct cgroup_pidlist *l) -{ - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. 
- */ - mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); - BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->mutex); - kfree(l); - return; - } - mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); -} - -static int cgroup_pidlist_release(struct inode *inode, struct file *file) -{ - struct cgroup_pidlist *l; - if (!(file->f_mode & FMODE_READ)) - return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); - return seq_release(inode, file); -} - -static const struct file_operations cgroup_pidlist_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, - .release = cgroup_pidlist_release, -}; - -/* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. 
- */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) -{ - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist *l; - int retval; - - /* Nothing to do for write-only files */ - if (!(file->f_mode & FMODE_READ)) - return 0; - - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; - - retval = seq_open(file, &cgroup_pidlist_seq_operations); - if (retval) { - cgroup_release_pid_array(l); - return retval; - } - ((struct seq_file *)file->private_data)->private = l; - return 0; -} -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} - -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) -{ - return notify_on_release(cgrp); -} - -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) -{ - clear_bit(CGRP_RELEASABLE, &cgrp->flags); - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - return 0; -} - -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup *cgrp = event->cgrp; - - event->cft->unregister_event(cgrp, event->cft, event->eventfd); - - eventfd_ctx_put(event->eventfd); - kfree(event); - dput(cgrp->dentry); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. 
- */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - __remove_wait_queue(event->wqh, &event->wait); - spin_lock(&cgrp->event_list_lock); - list_del(&event->list); - spin_unlock(&cgrp->event_list_lock); - /* - * We are in atomic context, but cgroup_event_remove() may - * sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format ' '. - * Interpretation of args is defined by control file implementation. 
- */ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) -{ - struct cgroup_event *event = NULL; - unsigned int efd, cfd; - struct file *efile = NULL; - struct file *cfile = NULL; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - event->cgrp = cgrp; - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = eventfd_fget(efd); - if (IS_ERR(efile)) { - ret = PTR_ERR(efile); - goto fail; - } - - event->eventfd = eventfd_ctx_fileget(efile); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto fail; - } - - cfile = fget(cfd); - if (!cfile) { - ret = -EBADF; - goto fail; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); - if (ret < 0) - goto fail; - - event->cft = __file_cft(cfile); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto fail; - } - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto fail; - } - - ret = event->cft->register_event(cgrp, event->cft, - event->eventfd, buffer); - if (ret) - goto fail; - - if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { - event->cft->unregister_event(cgrp, event->cft, event->eventfd); - ret = 0; - goto fail; - } - - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. Let's take reference to cgroup - * directory dentry to do that. 
- */ - dget(cgrp->dentry); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fput(cfile); - fput(efile); - - return 0; - -fail: - if (cfile) - fput(cfile); - - if (event && event->eventfd && !IS_ERR(event->eventfd)) - eventfd_ctx_put(event->eventfd); - - if (!IS_ERR_OR_NULL(efile)) - fput(efile); - - kfree(event); - - return ret; -} - -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) -{ - return clone_children(cgrp); -} - -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) -{ - if (val) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); - else - clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); - return 0; -} - -/* - * for the common functions, 'private' gives the type of file - */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." -static struct cftype files[] = { - { - .name = "tasks", - .open = cgroup_tasks_open, - .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, - .mode = S_IRUGO | S_IWUSR, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "procs", - .open = cgroup_procs_open, - .write_u64 = cgroup_procs_write, - .release = cgroup_pidlist_release, - .mode = S_IRUGO | S_IWUSR, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, -}; - -static struct cftype cft_release_agent = { - .name = "release_agent", - .read_seq_string = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, -}; - -static int cgroup_populate_dir(struct cgroup *cgrp) -{ - int err; - struct 
cgroup_subsys *ss; - - /* First clear out any existing files */ - cgroup_clear_directory(cgrp->dentry); - - err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); - if (err < 0) - return err; - - if (cgrp == cgrp->top_cgroup) { - if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) - return err; - } - - for_each_subsys(cgrp->root, ss) { - if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) - return err; - } - /* This cgroup is ready now */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (css->id) - rcu_assign_pointer(css->id->css, css); - } - - return 0; -} - -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) -{ - css->cgroup = cgrp; - atomic_set(&css->refcnt, 1); - css->flags = 0; - css->id = NULL; - if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); - BUG_ON(cgrp->subsys[ss->subsys_id]); - cgrp->subsys[ss->subsys_id] = css; -} - -static void cgroup_lock_hierarchy(struct cgroupfs_root *root) -{ - /* We need to take each hierarchy_mutex in a consistent order */ - int i; - - /* - * No worry about a race with rebind_subsystems that might mess up the - * locking order, since both parties are under cgroup_mutex. 
- */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_lock(&ss->hierarchy_mutex); - } -} - -static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) -{ - int i; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_unlock(&ss->hierarchy_mutex); - } -} - -/* - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held - */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) -{ - struct cgroup *cgrp; - struct cgroupfs_root *root = parent->root; - int err = 0; - struct cgroup_subsys *ss; - struct super_block *sb = root->sb; - - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - atomic_inc(&sb->s_active); - - mutex_lock(&cgroup_mutex); - - init_cgroup_housekeeping(cgrp); - - cgrp->parent = parent; - cgrp->root = parent->root; - cgrp->top_cgroup = parent->top_cgroup; - - if (notify_on_release(parent)) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - - if (clone_children(parent)) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); - - for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); - - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_destroy; - } - init_cgroup_css(css, ss, cgrp); - if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); - if (err) - goto err_destroy; - } - /* At error, ->destroy() callback has to free assigned ID. 
*/ - if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); - } - - cgroup_lock_hierarchy(root); - list_add(&cgrp->sibling, &cgrp->parent->children); - cgroup_unlock_hierarchy(root); - root->number_of_cgroups++; - - err = cgroup_create_dir(cgrp, dentry, mode); - if (err < 0) - goto err_remove; - - /* The cgroup directory was pre-locked for us */ - BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); - - err = cgroup_populate_dir(cgrp); - /* If err < 0, we have a half-filled directory - oh well ;) */ - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - return 0; - - err_remove: - - cgroup_lock_hierarchy(root); - list_del(&cgrp->sibling); - cgroup_unlock_hierarchy(root); - root->number_of_cgroups--; - - err_destroy: - - for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } - - mutex_unlock(&cgroup_mutex); - - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); - - kfree(cgrp); - return err; -} - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct cgroup *c_parent = dentry->d_parent->d_fsdata; - - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); -} - -static int cgroup_has_css_refs(struct cgroup *cgrp) -{ - /* Check the reference count on each subsystem. Since we - * already established that there are no tasks in the - * cgroup, if the css refcount is also 1, then there should - * be no outstanding references, so the subsystem is safe to - * destroy. 
We scan across all subsystems rather than using - * the per-hierarchy linked list of mounted subsystems since - * we can be called via check_for_release() with no - * synchronization other than RCU, and the subsystem linked - * list isn't RCU-safe */ - int i; - /* - * We won't need to lock the subsys array, because the subsystems - * we're concerned about aren't going anywhere since our cgroup root - * has a reference on them. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - struct cgroup_subsys_state *css; - /* Skip subsystems not present or not in this hierarchy */ - if (ss == NULL || ss->root != cgrp->root) - continue; - css = cgrp->subsys[ss->subsys_id]; - /* When called from check_for_release() it's possible - * that by this point the cgroup has been removed - * and the css deleted. But a false-positive doesn't - * matter, since it can only happen if the cgroup - * has been deleted and hence no longer needs the - * release agent to be called anyway. */ - if (css && (atomic_read(&css->refcnt) > 1)) - return 1; - } - return 0; -} - -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - */ - -static int cgroup_clear_css_refs(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; - local_irq_save(flags); - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - int refcnt; - while (1) { - /* We can only remove a CSS with a refcnt==1 */ - refcnt = atomic_read(&css->refcnt); - if (refcnt > 1) { - failed = true; - goto done; - } - BUG_ON(!refcnt); - /* - * Drop the refcnt to 0 while we check other - * subsystems. 
This will cause any racing - * css_tryget() to spin until we set the - * CSS_REMOVED bits or abort - */ - if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) - break; - cpu_relax(); - } - } - done: - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - if (failed) { - /* - * Restore old refcnt if we previously managed - * to clear it from 1 to 0 - */ - if (!atomic_read(&css->refcnt)) - atomic_set(&css->refcnt, 1); - } else { - /* Commit the fact that the CSS is removed */ - set_bit(CSS_REMOVED, &css->flags); - } - } - local_irq_restore(flags); - return !failed; -} - -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -{ - struct cgroup *cgrp = dentry->d_fsdata; - struct dentry *d; - struct cgroup *parent; - DEFINE_WAIT(wait); - struct cgroup_event *event, *tmp; - int ret; - - /* the vfs holds both inode->i_mutex already */ -again: - mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - mutex_unlock(&cgroup_mutex); - - /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. 
- */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; - } - - mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; - } - /* NO css_tryget() can success after here. */ - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - - raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); - if (!list_empty(&cgrp->release_list)) - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - - cgroup_lock_hierarchy(cgrp->root); - /* delete this cgroup from parent->children */ - list_del_init(&cgrp->sibling); - cgroup_unlock_hierarchy(cgrp->root); - - d = dget(cgrp->dentry); - - cgroup_d_remove_dir(d); - dput(d); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); - - /* - * Unregister events and notify userspace. 
- * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del(&event->list); - remove_wait_queue(event->wqh, &event->wait); - eventfd_signal(event->eventfd, 1); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - - /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &rootnode.subsys_list); - ss->root = &rootnode; - css = ss->create(ss, dummytop); - /* We don't handle early failures gracefully */ - BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, dummytop); - - /* Update the init_css_set to contain a subsys - * pointer to this state - since the subsystem is - * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; - - need_forkexit_callback |= ss->fork || ss->exit; - - /* At system boot, before all subsystems have been - * registered, no tasks have been forked, so we don't - * need to invoke fork callbacks here. */ - BUG_ON(!list_empty(&init_task.tasks)); - - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); - ss->active = 1; - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); -} - -/** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. 
If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. - */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - int i; - struct cgroup_subsys_state *css; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->create == NULL || ss->destroy == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. - */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a few sanity checks */ - BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); - BUG_ON(subsys[ss->subsys_id] != ss); - return 0; - } - - /* - * need to register a subsys id before anything else - for example, - * init_cgroup_css needs it. - */ - mutex_lock(&cgroup_mutex); - /* find the first empty slot in the array */ - for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { - if (subsys[i] == NULL) - break; - } - if (i == CGROUP_SUBSYS_COUNT) { - /* maximum number of subsystems already registered! */ - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - /* assign ourselves the subsys_id */ - ss->subsys_id = i; - subsys[i] = ss; - - /* - * no ss->create seems to need anything important in the ss struct, so - * this can happen first (i.e. before the rootnode attachment). - */ - css = ss->create(ss, dummytop); - if (IS_ERR(css)) { - /* failure case - need to deassign the subsys[] slot. 
*/ - subsys[i] = NULL; - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - list_add(&ss->sibling, &rootnode.subsys_list); - ss->root = &rootnode; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, dummytop); - /* init_idr must be after init_cgroup_css because it sets css->id. */ - if (ss->use_id) { - int ret = cgroup_init_idr(ss, css); - if (ret) { - dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); - subsys[i] = NULL; - mutex_unlock(&cgroup_mutex); - return ret; - } - } - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. - * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct css_set *cg; - struct hlist_node *node, *tmp; - struct hlist_head *bucket = &css_set_table[i], *new_bucket; - - hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { - /* skip entries that we already rehashed */ - if (cg->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hlist_del(&cg->hlist); - /* set new value */ - cg->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - new_bucket = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, new_bucket); - } - } - write_unlock(&css_set_lock); - - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); - ss->active = 1; - - /* success! */ - mutex_unlock(&cgroup_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. 
When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cg_cgroup_link *link; - struct hlist_head *hhead; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get in parse_cgroupfs_options should ensure that it - * doesn't start being used while we're killing it off. - */ - BUG_ON(ss->root != &rootnode); - - mutex_lock(&cgroup_mutex); - /* deassign the subsys_id */ - BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); - subsys[ss->subsys_id] = NULL; - - /* remove subsystem from rootnode's list of subsystems */ - list_del_init(&ss->sibling); - - /* - * disentangle the css from all css_sets attached to the dummytop. as - * in loading, we need to pay our respects to the hashtable gods. - */ - write_lock(&css_set_lock); - list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; - - hlist_del(&cg->hlist); - BUG_ON(!cg->subsys[ss->subsys_id]); - cg->subsys[ss->subsys_id] = NULL; - hhead = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, hhead); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the dummytop and free it - need to free - * before marking as null because ss->destroy needs the cgrp->subsys - * pointer to find their state. note that this also takes care of - * freeing the css_id. - */ - ss->destroy(ss, dummytop); - dummytop->subsys[ss->subsys_id] = NULL; - - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); - -/** - * cgroup_init_early - cgroup initialization at system boot - * - * Initialize cgroups at system boot, and initialize any - * subsystems that request early init. 
- */ -int __init cgroup_init_early(void) -{ - int i; - atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cg_links); - INIT_LIST_HEAD(&init_css_set.tasks); - INIT_HLIST_NODE(&init_css_set.hlist); - css_set_count = 1; - init_cgroup_root(&rootnode); - root_count = 1; - init_task.cgroups = &init_css_set; - - init_css_set_link.cg = &init_css_set; - init_css_set_link.cgrp = dummytop; - list_add(&init_css_set_link.cgrp_link_list, - &rootnode.top_cgroup.css_sets); - list_add(&init_css_set_link.cg_link_list, - &init_css_set.cg_links); - - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&css_set_table[i]); - - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->create); - BUG_ON(!ss->destroy); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } - - if (ss->early_init) - cgroup_init_subsys(ss); - } - return 0; -} - -/** - * cgroup_init - cgroup initialization - * - * Register cgroup filesystem and /proc file, and initialize - * any subsystems that didn't request early init. 
- */ -int __init cgroup_init(void) -{ - int err; - int i; - struct hlist_head *hhead; - - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; - - /* at bootup time, we don't worry about modular subsystems */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (!ss->early_init) - cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); - } - - /* Add init_css_set to the hash table */ - hhead = css_set_hash(init_css_set.subsys); - hlist_add_head(&init_css_set.hlist, hhead); - BUG_ON(!init_root_id(&rootnode)); - - cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; - } - - err = register_filesystem(&cgroup_fs_type); - if (err < 0) { - kobject_put(cgroup_kobj); - goto out; - } - - proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; -} - -/* - * proc_cgroup_show() - * - Print task's cgroup paths into seq_file, one line for each hierarchy - * - Used for /proc//cgroup. - * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. 
- */ - -/* TODO: Use a proper seq_file iterator */ -static int proc_cgroup_show(struct seq_file *m, void *v) -{ - struct pid *pid; - struct task_struct *tsk; - char *buf; - int retval; - struct cgroupfs_root *root; - - retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) - goto out; - - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = 0; - - mutex_lock(&cgroup_mutex); - - for_each_active_root(root) { - struct cgroup_subsys *ss; - struct cgroup *cgrp; - int count = 0; - - seq_printf(m, "%d:", root->hierarchy_id); - for_each_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); - if (strlen(root->name)) - seq_printf(m, "%sname=%s", count ? "," : "", - root->name); - seq_putc(m, ':'); - cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) - goto out_unlock; - seq_puts(m, buf); - seq_putc(m, '\n'); - } - -out_unlock: - mutex_unlock(&cgroup_mutex); - put_task_struct(tsk); -out_free: - kfree(buf); -out: - return retval; -} - -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. 
- */ - mutex_lock(&cgroup_mutex); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); - } - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/** - * cgroup_fork - attach newly forked task to its parents cgroup. - * @child: pointer to task_struct of forking parent process. - * - * Description: A task inherits its parent's cgroup at fork(). - * - * A pointer to the shared css_set was automatically copied in - * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer. cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. - * - * At the point that cgroup_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. - */ -void cgroup_fork(struct task_struct *child) -{ - /* - * We don't need to task_lock() current because current->cgroups - * can't be changed concurrently here. 
The parent obviously hasn't - * exited and called cgroup_exit(), and we are synchronized against - * cgroup migration through threadgroup_change_begin(). - */ - child->cgroups = current->cgroups; - get_css_set(child->cgroups); - INIT_LIST_HEAD(&child->cg_list); -} - -/** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - /* - * forkexit callbacks are only supported for builtin - * subsystems, and the builtin section of the subsys array is - * immutable, so we don't need to lock the subsys array here. - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->fork) - ss->fork(ss, child); - } - } -} - -/** - * cgroup_post_fork - called on a new task after adding it to the task list - * @child: the task in question - * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. - */ -void cgroup_post_fork(struct task_struct *child) -{ - if (use_task_css_set_links) { - write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) { - /* - * It's safe to use child->cgroups without task_lock() - * here because we are protected through - * threadgroup_change_begin() against concurrent - * css_set change in cgroup_task_migrate(). Also - * the task can't exit at that point until - * wake_up_new_task() is called, so we are protected - * against cgroup_exit() setting child->cgroup to - * init_css_set. 
- */ - list_add(&child->cg_list, &child->cgroups->tasks); - } - write_unlock(&css_set_lock); - } -} -/** - * cgroup_exit - detach cgroup from exiting task - * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? - * - * Description: Detach cgroup from @tsk and release it. - * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. - * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. - * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. - */ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) -{ - struct css_set *cg; - int i; - - /* - * Unlink from the css_set task list if necessary. 
- * Optimistically check cg_list before taking - * css_set_lock - */ - if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_del_init(&tsk->cg_list); - write_unlock(&css_set_lock); - } - - /* Reassign the task to the init_css_set. */ - task_lock(tsk); - cg = tsk->cgroups; - tsk->cgroups = &init_css_set; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) { - struct cgroup *old_cgrp = - rcu_dereference_raw(cg->subsys[i])->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); - } - } - } - task_unlock(tsk); - - if (cg) - put_css_set_taskexit(cg); -} - -/** - * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp - * @cgrp: the cgroup in question - * @task: the task in question - * - * See if @cgrp is a descendant of @task's cgroup in the appropriate - * hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. - */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) -{ - int ret; - struct cgroup *target; - - if (cgrp == dummytop) - return 1; - - target = task_cgroup_from_root(task, cgrp->root); - while (cgrp != target && cgrp!= cgrp->top_cgroup) - cgrp = cgrp->parent; - ret = (cgrp == target); - return ret; -} - -static void check_for_release(struct cgroup *cgrp) -{ - /* All of these checks rely on RCU to keep the cgroup - * structure alive */ - if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) - && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { - /* Control Group is currently removeable. 
If it's not - * already queued for a userspace notification, queue - * it now */ - int need_schedule_work = 0; - raw_spin_lock(&release_list_lock); - if (!cgroup_is_removed(cgrp) && - list_empty(&cgrp->release_list)) { - list_add(&cgrp->release_list, &release_list); - need_schedule_work = 1; - } - raw_spin_unlock(&release_list_lock); - if (need_schedule_work) - schedule_work(&release_agent_work); - } -} - -/* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css, int count) -{ - struct cgroup *cgrp = css->cgroup; - int val; - rcu_read_lock(); - val = atomic_sub_return(count, &css->refcnt); - if (val == 1) { - if (notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - cgroup_wakeup_rmdir_waiter(cgrp); - } - rcu_read_unlock(); - WARN_ON_ONCE(val < 1); -} -EXPORT_SYMBOL_GPL(__css_put); - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. 
We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. - */ -static void cgroup_release_agent(struct work_struct *work) -{ - BUG_ON(work != &release_agent_work); - mutex_lock(&cgroup_mutex); - raw_spin_lock(&release_list_lock); - while (!list_empty(&release_list)) { - char *argv[3], *envp[3]; - int i; - char *pathbuf = NULL, *agentbuf = NULL; - struct cgroup *cgrp = list_entry(release_list.next, - struct cgroup, - release_list); - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!pathbuf) - goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) - goto continue_free; - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!agentbuf) - goto continue_free; - - i = 0; - argv[i++] = agentbuf; - argv[i++] = pathbuf; - argv[i] = NULL; - - i = 0; - /* minimal command environment */ - envp[i++] = "HOME=/"; - envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[i] = NULL; - - /* Drop the lock while we invoke the usermode helper, - * since the exec could involve hitting disk and hence - * be a slow process */ - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - mutex_lock(&cgroup_mutex); - continue_free: - kfree(pathbuf); - kfree(agentbuf); - raw_spin_lock(&release_list_lock); - } - raw_spin_unlock(&release_list_lock); - mutex_unlock(&cgroup_mutex); -} - -static int __init cgroup_disable(char *str) -{ - int i; - char *token; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - /* - * cgroup_disable, being at boot time, can't know about module - * subsystems, so we don't worry about them. 
- */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - if (!strcmp(token, ss->name)) { - ss->disabled = 1; - printk(KERN_INFO "Disabling %s control group" - " subsystem\n", ss->name); - break; - } - } - } - return 1; -} -__setup("cgroup_disable=", cgroup_disable); - -/* - * Functons for CSS ID. - */ - -/* - *To get ID other than 0, this should be called when !cgroup_is_removed(). - */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -unsigned short css_depth(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); - - if (cssid) - return cssid->depth; - return 0; -} -EXPORT_SYMBOL_GPL(css_depth); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. 
- */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - bool ret = true; - - rcu_read_lock(); - child_id = rcu_dereference(child->id); - root_id = rcu_dereference(root->id); - if (!child_id - || !root_id - || (child_id->depth < root_id->depth) - || (child_id->stack[root_id->depth] != root_id->id)) - ret = false; - rcu_read_unlock(); - return ret; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = css->id; - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int myid, error, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - /* get id */ - if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { - error = -ENOMEM; - goto err_out; - } - write_lock(&ss->id_lock); - /* Don't use 0. 
allocates an ID of 1-65535 */ - error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); - - /* Returns error when there are no free spaces for new ID.*/ - if (error) { - error = -ENOSPC; - goto err_out; - } - if (myid > CSS_ID_MAX) - goto remove_idr; - - newid->id = myid; - newid->depth = depth; - return newid; -remove_idr: - error = -ENOSPC; - write_lock(&ss->id_lock); - idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); -err_out: - kfree(newid); - return ERR_PTR(error); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - rwlock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - newid->css = rootcss; - rootcss->id = newid; - return 0; -} - -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) -{ - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; - struct css_id *child_id, *parent_id; - - subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; - parent_id = parent_css->id; - depth = parent_id->depth + 1; - - child_id = get_new_cssid(ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. 
Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). 
- */ - read_lock(&ss->id_lock); - tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); - - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - -/* - * get corresponding css from file open on cgroupfs directory - */ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) -{ - struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; - - inode = f->f_dentry->d_inode; - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) - return ERR_PTR(-EBADF); - - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); - - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; - return css ? css : ERR_PTR(-ENOENT); -} - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - return cgroup_task_count(cont); -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct cgroup *cont, - struct cftype *cft, - struct seq_file *seq) -{ - 
struct cg_cgroup_link *link; - struct css_set *cg; - - read_lock(&css_set_lock); - rcu_read_lock(); - cg = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cg->cg_links, cg_link_list) { - struct cgroup *c = link->cgrp; - const char *name; - - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); - } - rcu_read_unlock(); - read_unlock(&css_set_lock); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cont, - struct cftype *cft, - struct seq_file *seq) -{ - struct cg_cgroup_link *link; - - read_lock(&css_set_lock); - list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; - struct task_struct *task; - int count = 0; - seq_printf(seq, "css_set %p\n", cg); - list_for_each_entry(task, &cg->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } - } - } - read_unlock(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype debug_files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, debug_files, - ARRAY_SIZE(debug_files)); 
-} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; -#endif /* CONFIG_CGROUP_DEBUG */ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include -#include -#include -#include -#include -#include -#include - -enum freezer_state { - CGROUP_THAWED = 0, - CGROUP_FREEZING, - CGROUP_FROZEN, -}; - -struct freezer { - struct cgroup_subsys_state css; - enum freezer_state state; - spinlock_t lock; /* protects _writes_ to state */ -}; - -static inline struct freezer *cgroup_freezer( - struct cgroup *cgroup) -{ - return container_of( - cgroup_subsys_state(cgroup, freezer_subsys_id), - struct freezer, css); -} - -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return container_of(task_subsys_state(task, freezer_subsys_id), - struct freezer, css); -} - -bool cgroup_freezing(struct task_struct *task) -{ - enum freezer_state state; - bool ret; - - rcu_read_lock(); - state = task_freezer(task)->state; - ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; - rcu_read_unlock(); - - return ret; -} - -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */ -static const char *freezer_state_strs[] = { - "THAWED", - "FREEZING", - "FROZEN", -}; - -/* - * State diagram - * Transitions are caused by userspace writes to the freezer.state file. - * The values in parenthesis are state labels. 
The rest are edge labels. - * - * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) - * ^ ^ | | - * | \_______THAWED_______/ | - * \__________________________THAWED____________/ - */ - -struct cgroup_subsys freezer_subsys; - -/* Locks taken and their ordering - * ------------------------------ - * cgroup_mutex (AKA cgroup_lock) - * freezer->lock - * css_set_lock - * task->alloc_lock (AKA task_lock) - * task->sighand->siglock - * - * cgroup code forces css_set_lock to be taken before task->alloc_lock - * - * freezer_create(), freezer_destroy(): - * cgroup_mutex [ by cgroup core ] - * - * freezer_can_attach(): - * cgroup_mutex (held by caller of can_attach) - * - * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * freezer->lock - * sighand->siglock (if the cgroup is freezing) - * - * freezer_read(): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * - * freezer_write() (freeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock (fake signal delivery inside freeze_task()) - * - * freezer_write() (unfreeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) - * sighand->siglock - */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct freezer *freezer; - - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&freezer->lock); - freezer->state = CGROUP_THAWED; - return &freezer->css; -} - -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct freezer 
*freezer = cgroup_freezer(cgroup); - - if (freezer->state != CGROUP_THAWED) - atomic_dec(&system_freezing_cnt); - kfree(freezer); -} - -/* task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - -/* - * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence the - * can_attach() result will remain valid until the attach completes. - */ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, - struct cgroup_taskset *tset) -{ - struct freezer *freezer; - struct task_struct *task; - - /* - * Anything frozen can't move or be moved to/from. - */ - cgroup_taskset_for_each(task, new_cgroup, tset) - if (cgroup_freezing(task)) - return -EBUSY; - - freezer = cgroup_freezer(new_cgroup); - if (freezer->state != CGROUP_THAWED) - return -EBUSY; - - return 0; -} - -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) -{ - struct freezer *freezer; - - /* - * No lock is needed, since the task isn't on tasklist yet, - * so it can't be moved to another cgroup, which means the - * freezer won't be removed and will be valid during this - * function call. Nevertheless, apply RCU read-side critical - * section to suppress RCU lockdep false positives. - */ - rcu_read_lock(); - freezer = task_freezer(task); - rcu_read_unlock(); - - /* - * The root cgroup is non-freezable, so we can skip the - * following check. - */ - if (!freezer->css.cgroup->parent) - return; - - spin_lock_irq(&freezer->lock); - BUG_ON(freezer->state == CGROUP_FROZEN); - - /* Locking avoids race with FREEZING -> THAWED transitions. 
*/ - if (freezer->state == CGROUP_FREEZING) - freeze_task(task); - spin_unlock_irq(&freezer->lock); -} - -/* - * caller must hold freezer->lock - */ -static void update_if_frozen(struct cgroup *cgroup, - struct freezer *freezer) -{ - struct cgroup_iter it; - struct task_struct *task; - unsigned int nfrozen = 0, ntotal = 0; - enum freezer_state old_state = freezer->state; - - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - ntotal++; - if (freezing(task) && is_task_frozen_enough(task)) - nfrozen++; - } - - if (old_state == CGROUP_THAWED) { - BUG_ON(nfrozen > 0); - } else if (old_state == CGROUP_FREEZING) { - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - } else { /* old_state == CGROUP_FROZEN */ - BUG_ON(nfrozen != ntotal); - } - - cgroup_iter_end(cgroup, &it); -} - -static int freezer_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct freezer *freezer; - enum freezer_state state; - - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - - freezer = cgroup_freezer(cgroup); - spin_lock_irq(&freezer->lock); - state = freezer->state; - if (state == CGROUP_FREEZING) { - /* We change from FREEZING to FROZEN lazily if the cgroup was - * only partially frozen when we exitted write. */ - update_if_frozen(cgroup, freezer); - state = freezer->state; - } - spin_unlock_irq(&freezer->lock); - cgroup_unlock(); - - seq_puts(m, freezer_state_strs[state]); - seq_putc(m, '\n'); - return 0; -} - -static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) -{ - struct cgroup_iter it; - struct task_struct *task; - unsigned int num_cant_freeze_now = 0; - - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task)) - continue; - if (is_task_frozen_enough(task)) - continue; - if (!freezing(task) && !freezer_should_skip(task)) - num_cant_freeze_now++; - } - cgroup_iter_end(cgroup, &it); - - return num_cant_freeze_now ? 
-EBUSY : 0; -} - -static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) -{ - struct cgroup_iter it; - struct task_struct *task; - - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) - __thaw_task(task); - cgroup_iter_end(cgroup, &it); -} - -static int freezer_change_state(struct cgroup *cgroup, - enum freezer_state goal_state) -{ - struct freezer *freezer; - int retval = 0; - - freezer = cgroup_freezer(cgroup); - - spin_lock_irq(&freezer->lock); - - update_if_frozen(cgroup, freezer); - - switch (goal_state) { - case CGROUP_THAWED: - if (freezer->state != CGROUP_THAWED) - atomic_dec(&system_freezing_cnt); - freezer->state = CGROUP_THAWED; - unfreeze_cgroup(cgroup, freezer); - break; - case CGROUP_FROZEN: - if (freezer->state == CGROUP_THAWED) - atomic_inc(&system_freezing_cnt); - freezer->state = CGROUP_FREEZING; - retval = try_to_freeze_cgroup(cgroup, freezer); - break; - default: - BUG(); - } - - spin_unlock_irq(&freezer->lock); - - return retval; -} - -static int freezer_write(struct cgroup *cgroup, - struct cftype *cft, - const char *buffer) -{ - int retval; - enum freezer_state goal_state; - - if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) - goal_state = CGROUP_THAWED; - else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) - goal_state = CGROUP_FROZEN; - else - return -EINVAL; - - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - retval = freezer_change_state(cgroup, goal_state); - cgroup_unlock(); - return retval; -} - -static struct cftype files[] = { - { - .name = "state", - .read_seq_string = freezer_read, - .write_string = freezer_write, - }, -}; - -static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) -{ - if (!cgroup->parent) - return 0; - return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys freezer_subsys = { - .name = "freezer", - .create = freezer_create, - .destroy = freezer_destroy, - .populate = 
freezer_populate, - .subsys_id = freezer_subsys_id, - .can_attach = freezer_can_attach, - .fork = freezer_fork, -}; -/* - * linux/kernel/compat.c - * - * Kernel compatibililty routines for e.g. 32 bit syscall support - * on 64 bit kernels. - * - * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include /* for MAX_SCHEDULE_TIMEOUT */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Note that the native side is already converted to a timespec, because - * that's what we want anyway. - */ -static int compat_get_timeval(struct timespec *o, - struct compat_timeval __user *i) -{ - long usec; - - if (get_user(o->tv_sec, &i->tv_sec) || - get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -static int compat_put_timeval(struct compat_timeval __user *o, - struct timeval *i) -{ - return (put_user(i->tv_sec, &o->tv_sec) || - put_user(i->tv_usec, &o->tv_usec)) ? 
-EFAULT : 0; -} - -static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) -{ - memset(txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc->modes, &utp->modes) || - __get_user(txc->offset, &utp->offset) || - __get_user(txc->freq, &utp->freq) || - __get_user(txc->maxerror, &utp->maxerror) || - __get_user(txc->esterror, &utp->esterror) || - __get_user(txc->status, &utp->status) || - __get_user(txc->constant, &utp->constant) || - __get_user(txc->precision, &utp->precision) || - __get_user(txc->tolerance, &utp->tolerance) || - __get_user(txc->time.tv_sec, &utp->time.tv_sec) || - __get_user(txc->time.tv_usec, &utp->time.tv_usec) || - __get_user(txc->tick, &utp->tick) || - __get_user(txc->ppsfreq, &utp->ppsfreq) || - __get_user(txc->jitter, &utp->jitter) || - __get_user(txc->shift, &utp->shift) || - __get_user(txc->stabil, &utp->stabil) || - __get_user(txc->jitcnt, &utp->jitcnt) || - __get_user(txc->calcnt, &utp->calcnt) || - __get_user(txc->errcnt, &utp->errcnt) || - __get_user(txc->stbcnt, &utp->stbcnt)) - return -EFAULT; - - return 0; -} - -static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) -{ - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc->modes, &utp->modes) || - __put_user(txc->offset, &utp->offset) || - __put_user(txc->freq, &utp->freq) || - __put_user(txc->maxerror, &utp->maxerror) || - __put_user(txc->esterror, &utp->esterror) || - __put_user(txc->status, &utp->status) || - __put_user(txc->constant, &utp->constant) || - __put_user(txc->precision, &utp->precision) || - __put_user(txc->tolerance, &utp->tolerance) || - __put_user(txc->time.tv_sec, &utp->time.tv_sec) || - __put_user(txc->time.tv_usec, &utp->time.tv_usec) || - __put_user(txc->tick, &utp->tick) || - __put_user(txc->ppsfreq, &utp->ppsfreq) || - __put_user(txc->jitter, &utp->jitter) || - __put_user(txc->shift, &utp->shift) || - 
__put_user(txc->stabil, &utp->stabil) || - __put_user(txc->jitcnt, &utp->jitcnt) || - __put_user(txc->calcnt, &utp->calcnt) || - __put_user(txc->errcnt, &utp->errcnt) || - __put_user(txc->stbcnt, &utp->stbcnt) || - __put_user(txc->tai, &utp->tai)) - return -EFAULT; - return 0; -} - -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (compat_put_timeval(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - - return 0; -} - -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (compat_get_timeval(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - -int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) -{ - return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) -{ - return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? 
-EFAULT : 0; -} -EXPORT_SYMBOL_GPL(put_compat_timespec); - -static long compat_nanosleep_restart(struct restart_block *restart) -{ - struct compat_timespec __user *rmtp; - struct timespec rmt; - mm_segment_t oldfs; - long ret; - - restart->nanosleep.rmtp = (struct timespec __user *) &rmt; - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep_restart(restart); - set_fs(oldfs); - - if (ret) { - rmtp = restart->nanosleep.compat_rmtp; - - if (rmtp && put_compat_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) -{ - struct timespec tu, rmt; - mm_segment_t oldfs; - long ret; - - if (get_compat_timespec(&tu, rqtp)) - return -EFAULT; - - if (!timespec_valid(&tu)) - return -EINVAL; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu, - rmtp ? (struct timespec __user *)&rmt : NULL, - HRTIMER_MODE_REL, CLOCK_MONOTONIC); - set_fs(oldfs); - - if (ret) { - struct restart_block *restart - = ¤t_thread_info()->restart_block; - - restart->fn = compat_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - - if (rmtp && put_compat_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -static inline long get_compat_itimerval(struct itimerval *o, - struct compat_itimerval __user *i) -{ - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | - __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | - __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | - __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); -} - -static inline long put_compat_itimerval(struct compat_itimerval __user *o, - struct itimerval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | - __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | - __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) 
| - __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); -} - -asmlinkage long compat_sys_getitimer(int which, - struct compat_itimerval __user *it) -{ - struct itimerval kit; - int error; - - error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) - error = -EFAULT; - return error; -} - -asmlinkage long compat_sys_setitimer(int which, - struct compat_itimerval __user *in, - struct compat_itimerval __user *out) -{ - struct itimerval kin, kout; - int error; - - if (in) { - if (get_compat_itimerval(&kin, in)) - return -EFAULT; - } else - memset(&kin, 0, sizeof(kin)); - - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) - return error; - if (put_compat_itimerval(out, &kout)) - return -EFAULT; - return 0; -} - -static compat_clock_t clock_t_to_compat_clock_t(clock_t x) -{ - return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); -} - -asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) -{ - if (tbuf) { - struct tms tms; - struct compat_tms tmp; - - do_sys_times(&tms); - /* Convert our struct tms to the compat version. */ - tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); - tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); - tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); - tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); - if (copy_to_user(tbuf, &tmp, sizeof(tmp))) - return -EFAULT; - } - force_successful_syscall_return(); - return compat_jiffies_to_clock_t(jiffies); -} - -#ifdef __ARCH_WANT_SYS_SIGPENDING - -/* - * Assumption: old_sigset_t and compat_old_sigset_t are both - * types that can be passed to put_user()/get_user(). 
- */ - -asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sigpending((old_sigset_t __user *) &s); - set_fs(old_fs); - if (ret == 0) - ret = put_user(s, set); - return ret; -} - -#endif - -#ifdef __ARCH_WANT_SYS_SIGPROCMASK - -asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, - compat_old_sigset_t __user *oset) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs; - - if (set && get_user(s, set)) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_sigprocmask(how, - set ? (old_sigset_t __user *) &s : NULL, - oset ? (old_sigset_t __user *) &s : NULL); - set_fs(old_fs); - if (ret == 0) - if (oset) - ret = put_user(s, oset); - return ret; -} - -#endif - -asmlinkage long compat_sys_setrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) -{ - struct rlimit r; - - if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || - __get_user(r.rlim_cur, &rlim->rlim_cur) || - __get_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - - if (r.rlim_cur == COMPAT_RLIM_INFINITY) - r.rlim_cur = RLIM_INFINITY; - if (r.rlim_max == COMPAT_RLIM_INFINITY) - r.rlim_max = RLIM_INFINITY; - return do_prlimit(current, resource, &r, NULL); -} - -#ifdef COMPAT_RLIM_OLD_INFINITY - -asmlinkage long compat_sys_old_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, &r); - set_fs(old_fs); - - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -#endif - -asmlinkage long 
compat_sys_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) -{ - struct rlimit r; - int ret; - - ret = do_prlimit(current, resource, NULL, &r); - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) -{ - if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || - __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || - __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || - __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || - __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || - __put_user(r->ru_maxrss, &ru->ru_maxrss) || - __put_user(r->ru_ixrss, &ru->ru_ixrss) || - __put_user(r->ru_idrss, &ru->ru_idrss) || - __put_user(r->ru_isrss, &ru->ru_isrss) || - __put_user(r->ru_minflt, &ru->ru_minflt) || - __put_user(r->ru_majflt, &ru->ru_majflt) || - __put_user(r->ru_nswap, &ru->ru_nswap) || - __put_user(r->ru_inblock, &ru->ru_inblock) || - __put_user(r->ru_oublock, &ru->ru_oublock) || - __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || - __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || - __put_user(r->ru_nsignals, &ru->ru_nsignals) || - __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || - __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) - return -EFAULT; - return 0; -} - -asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) -{ - struct rusage r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_getrusage(who, (struct rusage __user *) &r); - set_fs(old_fs); - - if (ret) - return ret; - - if (put_compat_rusage(&r, ru)) - return -EFAULT; - - return 0; -} - -asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int 
options, - struct compat_rusage __user *ru) -{ - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; - } -} - -asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, - struct compat_siginfo __user *uinfo, int options, - struct compat_rusage __user *uru) -{ - siginfo_t info; - struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); - - set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - ret = put_compat_rusage(&ru, uru); - if (ret) - return ret; - } - - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(uinfo, &info); -} - -static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, - unsigned len, struct cpumask *new_mask) -{ - unsigned long *k; - - if (len < cpumask_size()) - memset(new_mask, 0, cpumask_size()); - else if (len > cpumask_size()) - len = cpumask_size(); - - k = cpumask_bits(new_mask); - return compat_get_bitmap(k, user_mask_ptr, len * 8); -} - -asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, - unsigned int len, - compat_ulong_t __user *user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval) - goto out; - - retval = sched_setaffinity(pid, new_mask); -out: - free_cpumask_var(new_mask); - 
return retval; -} - -asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, - compat_ulong_t __user *user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(compat_ulong_t)-1)) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); - - if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); - - return ret; -} - -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (get_compat_timespec(&dst->it_interval, &src->it_interval) || - get_compat_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (put_compat_timespec(&src->it_interval, &dst->it_interval) || - put_compat_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - -long compat_sys_timer_create(clockid_t which_clock, - struct compat_sigevent __user *timer_event_spec, - timer_t __user *created_timer_id) -{ - struct sigevent __user *event = NULL; - - if (timer_event_spec) { - struct sigevent kevent; - - event = compat_alloc_user_space(sizeof(*event)); - if (get_compat_sigevent(&kevent, timer_event_spec) || - copy_to_user(event, &kevent, sizeof(*event))) - return -EFAULT; - } - - return sys_timer_create(which_clock, event, created_timer_id); -} - -long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old) -{ - long err; - mm_segment_t oldfs; - struct itimerspec newts, oldts; - - if (!new) - return -EINVAL; - if (get_compat_itimerspec(&newts, new)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = 
sys_timer_settime(timer_id, flags, - (struct itimerspec __user *) &newts, - (struct itimerspec __user *) &oldts); - set_fs(oldfs); - if (!err && old && put_compat_itimerspec(old, &oldts)) - return -EFAULT; - return err; -} - -long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting) -{ - long err; - mm_segment_t oldfs; - struct itimerspec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_gettime(timer_id, - (struct itimerspec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_itimerspec(setting, &ts)) - return -EFAULT; - return err; -} - -long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - if (get_compat_timespec(&ts, tp)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_settime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - return err; -} - -long compat_sys_clock_gettime(clockid_t which_clock, - struct compat_timespec __user *tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_gettime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -long compat_sys_clock_adjtime(clockid_t which_clock, - struct compat_timex __user *utp) -{ - struct timex txc; - mm_segment_t oldfs; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); - set_fs(oldfs); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - -long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_getres(which_clock, - (struct timespec 
__user *) &ts); - set_fs(oldfs); - if (!err && tp && put_compat_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -static long compat_clock_nanosleep_restart(struct restart_block *restart) -{ - long err; - mm_segment_t oldfs; - struct timespec tu; - struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; - - restart->nanosleep.rmtp = (struct timespec __user *) &tu; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = clock_nanosleep_restart(restart); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&tu, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) -{ - long err; - mm_segment_t oldfs; - struct timespec in, out; - struct restart_block *restart; - - if (get_compat_timespec(&in, rqtp)) - return -EFAULT; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_nanosleep(which_clock, flags, - (struct timespec __user *) &in, - (struct timespec __user *) &out); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&out, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t_thread_info()->restart_block; - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -/* - * We currently only need the following fields from the sigevent - * structure: sigev_value, sigev_signo, sig_notify and (sometimes - * sigev_notify_thread_id). The others are handled in user mode. - * We also assume that copying sigev_value.sival_int is sufficient - * to keep all the bits of sigev_value.sival_ptr intact. 
- */ -int get_compat_sigevent(struct sigevent *event, - const struct compat_sigevent __user *u_event) -{ - memset(event, 0, sizeof(*event)); - return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) || - __get_user(event->sigev_value.sival_int, - &u_event->sigev_value.sival_int) || - __get_user(event->sigev_signo, &u_event->sigev_signo) || - __get_user(event->sigev_notify, &u_event->sigev_notify) || - __get_user(event->sigev_notify_thread_id, - &u_event->sigev_notify_thread_id)) - ? -EFAULT : 0; -} - -long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, - unsigned long bitmap_size) -{ - int i, j; - unsigned long m; - compat_ulong_t um; - unsigned long nr_compat_longs; - - /* align bitmap up to nearest compat_long_t boundary */ - bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); - - if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) - return -EFAULT; - - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = 0; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - /* - * We dont want to read past the end of the userspace - * bitmap. We must however ensure the end of the - * kernel bitmap is zeroed. 
- */ - if (nr_compat_longs-- > 0) { - if (__get_user(um, umask)) - return -EFAULT; - } else { - um = 0; - } - - umask++; - m |= (long)um << (j * BITS_PER_COMPAT_LONG); - } - *mask++ = m; - } - - return 0; -} - -long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, - unsigned long bitmap_size) -{ - int i, j; - unsigned long m; - compat_ulong_t um; - unsigned long nr_compat_longs; - - /* align bitmap up to nearest compat_long_t boundary */ - bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); - - if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) - return -EFAULT; - - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = *mask++; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - um = m; - - /* - * We dont want to write past the end of the userspace - * bitmap. - */ - if (nr_compat_longs-- > 0) { - if (__put_user(um, umask)) - return -EFAULT; - } - - umask++; - m >>= 4*sizeof(um); - m >>= 4*sizeof(um); - } - } - - return 0; -} - -void -sigset_from_compat (sigset_t *set, compat_sigset_t *compat) -{ - switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); - } -} -EXPORT_SYMBOL_GPL(sigset_from_compat); - -asmlinkage long -compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, - struct compat_siginfo __user *uinfo, - struct compat_timespec __user *uts, compat_size_t sigsetsize) -{ - compat_sigset_t s32; - sigset_t s; - struct timespec t; - siginfo_t info; - long ret; - - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&s, &s32); - - if (uts) { - if (get_compat_timespec(&t, uts)) - return -EFAULT; - } - - ret = 
do_sigtimedwait(&s, &info, uts ? &t : NULL); - - if (ret > 0 && uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - - return ret; - -} - -asmlinkage long -compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, - struct compat_siginfo __user *uinfo) -{ - siginfo_t info; - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); -} - -#ifdef __ARCH_WANT_COMPAT_SYS_TIME - -/* compat_time_t is a 32 bit "long" and needs to get converted. */ - -asmlinkage long compat_sys_time(compat_time_t __user * tloc) -{ - compat_time_t i; - struct timeval tv; - - do_gettimeofday(&tv); - i = tv.tv_sec; - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -asmlinkage long compat_sys_stime(compat_time_t __user *tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ - -#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) -{ - sigset_t newset; - compat_sigset_t newset32; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&newset, &newset32); - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; -} -#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ - -asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) -{ - struct timex txc; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - ret = do_adjtimex(&txc); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - -#ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} - -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, - const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} -#endif - -struct compat_sysinfo { - s32 uptime; - u32 loads[3]; - u32 totalram; - u32 freeram; - u32 sharedram; - u32 bufferram; - u32 totalswap; - u32 freeswap; - u16 procs; - u16 pad; - u32 totalhigh; - u32 freehigh; - u32 mem_unit; - char _f[20-2*sizeof(u32)-sizeof(int)]; -}; - -asmlinkage long -compat_sys_sysinfo(struct compat_sysinfo __user *info) -{ - struct sysinfo s; - - do_sysinfo(&s); - - /* Check to see if any memory value is too large for 32-bit and scale - * down if needed - */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { - int bitcount = 0; - - while (s.mem_unit < PAGE_SIZE) { - s.mem_unit <<= 1; - bitcount++; - } - - s.totalram >>= bitcount; - s.freeram >>= bitcount; - s.sharedram >>= bitcount; - s.bufferram >>= bitcount; - s.totalswap >>= bitcount; - s.freeswap >>= bitcount; - s.totalhigh >>= bitcount; - s.freehigh >>= bitcount; - } - - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || - __put_user (s.uptime, &info->uptime) || - __put_user (s.loads[0], &info->loads[0]) || - __put_user (s.loads[1], &info->loads[1]) || - __put_user (s.loads[2], &info->loads[2]) || - __put_user (s.totalram, &info->totalram) || - __put_user (s.freeram, &info->freeram) || - __put_user (s.sharedram, &info->sharedram) || - __put_user (s.bufferram, &info->bufferram) || - __put_user (s.totalswap, &info->totalswap) || - __put_user (s.freeswap, &info->freeswap) || - __put_user (s.procs, &info->procs) || - __put_user (s.totalhigh, &info->totalhigh) || - __put_user (s.freehigh, &info->freehigh) || - __put_user 
(s.mem_unit, &info->mem_unit)) - return -EFAULT; - - return 0; -} - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); -/* - * kernel/configs.c - * Echo the kernel .config file used to build the kernel - * - * Copyright (C) 2002 Khalid Aziz - * Copyright (C) 2002 Randy Dunlap - * Copyright (C) 2002 Al Stone - * Copyright (C) 2002 Hewlett-Packard Company - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include - -/**************************************************/ -/* the actual current config file */ - -/* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. 
The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * - * IKCFG_ED - */ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) - -#ifdef CONFIG_IKCONFIG_PROC - -static ssize_t -ikconfig_read_current(struct file *file, char __user *buf, - size_t len, loff_t * offset) -{ - return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); -} - -static const struct file_operations ikconfig_file_ops = { - .owner = THIS_MODULE, - .read = ikconfig_read_current, - .llseek = default_llseek, -}; - -static int __init ikconfig_init(void) -{ - struct proc_dir_entry *entry; - - /* create the current config file */ - entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, - &ikconfig_file_ops); - if (!entry) - return -ENOMEM; - - entry->size = kernel_config_data_size; - - return 0; -} - -static void __exit ikconfig_cleanup(void) -{ - remove_proc_entry("config.gz", NULL); -} - -module_init(ikconfig_init); -module_exit(ikconfig_cleanup); - -#endif /* CONFIG_IKCONFIG_PROC */ - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Randy Dunlap"); -MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); -/* CPU control. - * (C) 2001, 2002, 2003, 2004 Rusty Russell - * - * This code is licenced under the GPL. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP -/* Serializes the updates to cpu_online_mask, cpu_present_mask */ -static DEFINE_MUTEX(cpu_add_remove_lock); - -/* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. 
- */ -void cpu_maps_update_begin(void) -{ - mutex_lock(&cpu_add_remove_lock); -} - -void cpu_maps_update_done(void) -{ - mutex_unlock(&cpu_add_remove_lock); -} - -static RAW_NOTIFIER_HEAD(cpu_chain); - -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. - * Should always be manipulated under cpu_add_remove_lock - */ -static int cpu_hotplug_disabled; - -#ifdef CONFIG_HOTPLUG_CPU - -static struct { - struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. - */ - int refcount; -} cpu_hotplug = { - .active_writer = NULL, - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), - .refcount = 0, -}; - -void get_online_cpus(void) -{ - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.refcount++; - mutex_unlock(&cpu_hotplug.lock); - -} -EXPORT_SYMBOL_GPL(get_online_cpus); - -void put_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return; - mutex_lock(&cpu_hotplug.lock); - if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) - wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); - -} -EXPORT_SYMBOL_GPL(put_online_cpus); - -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. 
- * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. - * - */ -static void cpu_hotplug_begin(void) -{ - cpu_hotplug.active_writer = current; - - for (;;) { - mutex_lock(&cpu_hotplug.lock); - if (likely(!cpu_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } -} - -static void cpu_hotplug_done(void) -{ - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); -} - -#else /* #if CONFIG_HOTPLUG_CPU */ -static void cpu_hotplug_begin(void) {} -static void cpu_hotplug_done(void) {} -#endif /* #else #if CONFIG_HOTPLUG_CPU */ - -/* Need to know about CPUs going up/down? */ -int __ref register_cpu_notifier(struct notifier_block *nb) -{ - int ret; - cpu_maps_update_begin(); - ret = raw_notifier_chain_register(&cpu_chain, nb); - cpu_maps_update_done(); - return ret; -} - -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, - int *nr_calls) -{ - int ret; - - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, - nr_calls); - - return notifier_to_errno(ret); -} - -static int cpu_notify(unsigned long val, void *v) -{ - return __cpu_notify(val, v, -1, NULL); -} - -#ifdef CONFIG_HOTPLUG_CPU - -static void cpu_notify_nofail(unsigned long val, void *v) -{ - BUG_ON(cpu_notify(val, v)); -} -EXPORT_SYMBOL(register_cpu_notifier); - -void __ref unregister_cpu_notifier(struct notifier_block *nb) -{ - cpu_maps_update_begin(); - raw_notifier_chain_unregister(&cpu_chain, nb); - cpu_maps_update_done(); -} -EXPORT_SYMBOL(unregister_cpu_notifier); - -static inline void check_for_tasks(int cpu) -{ - struct task_struct *p; - - write_lock_irq(&tasklist_lock); - for_each_process(p) { - if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (p->utime || p->stime)) - printk(KERN_WARNING "Task %s (pid = %d) 
is on cpu %d " - "(state = %ld, flags = %x)\n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); - } - write_unlock_irq(&tasklist_lock); -} - -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; - -/* Take this CPU down. */ -static int __ref take_cpu_down(void *_param) -{ - struct take_cpu_down_param *param = _param; - int err; - - /* Ensure this CPU doesn't handle any more interrupts. */ - err = __cpu_disable(); - if (err < 0) - return err; - - cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; -} - -/* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) -{ - int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } - - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); - if (err) { - /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - - goto out_release; - } - BUG_ON(cpu_online(cpu)); - - /* - * The migration_call() CPU_DYING callback will have removed all - * runnable tasks from the cpu, there's only the idle task left now - * that the migration thread is done doing the stop_machine thing. - * - * Wait for the stop thread to go away. - */ - while (!idle_cpu(cpu)) - cpu_relax(); - - /* This actually kills the CPU. */ - __cpu_die(cpu); - - /* CPU is completely dead: tell everyone. Too late to complain. 
*/ - cpu_notify_nofail(CPU_DEAD | mod, hcpu); - - check_for_tasks(cpu); - -out_release: - cpu_hotplug_done(); - if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); - return err; -} - -int __ref cpu_down(unsigned int cpu) -{ - int err; - - cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - err = _cpu_down(cpu, 0); - -out: - cpu_maps_update_done(); - return err; -} -EXPORT_SYMBOL(cpu_down); -#endif /*CONFIG_HOTPLUG_CPU*/ - -/* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) -{ - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - - if (cpu_online(cpu) || !cpu_present(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); - if (ret) { - nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; - } - - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu); - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); - - /* Now call notifier in preparation. 
*/ - cpu_notify(CPU_ONLINE | mod, hcpu); - -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); - cpu_hotplug_done(); - - return ret; -} - -int __cpuinit cpu_up(unsigned int cpu) -{ - int err = 0; - -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; - pg_data_t *pgdat; -#endif - - if (!cpu_possible(cpu)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); -#endif - return -EINVAL; - } - -#ifdef CONFIG_MEMORY_HOTPLUG - nid = cpu_to_node(cpu); - if (!node_online(nid)) { - err = mem_online_node(nid); - if (err) - return err; - } - - pgdat = NODE_DATA(nid); - if (!pgdat) { - printk(KERN_ERR - "Can't online cpu %d due to NULL pgdat\n", cpu); - return -ENOMEM; - } - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); - mutex_unlock(&zonelists_mutex); - } -#endif - - cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - err = _cpu_up(cpu, 0); - -out: - cpu_maps_update_done(); - return err; -} -EXPORT_SYMBOL_GPL(cpu_up); - -#ifdef CONFIG_PM_SLEEP_SMP -static cpumask_var_t frozen_cpus; - -void __weak arch_disable_nonboot_cpus_begin(void) -{ -} - -void __weak arch_disable_nonboot_cpus_end(void) -{ -} - -int disable_nonboot_cpus(void) -{ - int cpu, first_cpu, error = 0; - - cpu_maps_update_begin(); - first_cpu = cpumask_first(cpu_online_mask); - /* - * We take down all of the non-boot CPUs in one shot to avoid races - * with the userspace trying to use the CPU hotplug at the same time - */ - cpumask_clear(frozen_cpus); - arch_disable_nonboot_cpus_begin(); - - printk("Disabling non-boot CPUs ...\n"); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - error = _cpu_down(cpu, 1); - if (!error) - cpumask_set_cpu(cpu, frozen_cpus); - else { - printk(KERN_ERR "Error taking CPU%d 
down: %d\n", - cpu, error); - break; - } - } - - arch_disable_nonboot_cpus_end(); - - if (!error) { - BUG_ON(num_online_cpus() > 1); - /* Make sure the CPUs won't be enabled by someone else */ - cpu_hotplug_disabled = 1; - } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); - } - cpu_maps_update_done(); - return error; -} - -void __weak arch_enable_nonboot_cpus_begin(void) -{ -} - -void __weak arch_enable_nonboot_cpus_end(void) -{ -} - -void __ref enable_nonboot_cpus(void) -{ - int cpu, error; - - /* Allow everyone to use the CPU hotplug again */ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - if (cpumask_empty(frozen_cpus)) - goto out; - - printk(KERN_INFO "Enabling non-boot CPUs ...\n"); - - arch_enable_nonboot_cpus_begin(); - - for_each_cpu(cpu, frozen_cpus) { - error = _cpu_up(cpu, 1); - if (!error) { - printk(KERN_INFO "CPU%d is up\n", cpu); - continue; - } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); - } - - arch_enable_nonboot_cpus_end(); - - cpumask_clear(frozen_cpus); -out: - cpu_maps_update_done(); -} - -static int __init alloc_frozen_cpus(void) -{ - if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) - return -ENOMEM; - return 0; -} -core_initcall(alloc_frozen_cpus); - -/* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. 
- */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - -/* - * When callbacks for CPU hotplug notifications are being executed, we must - * ensure that the state of the system with respect to the tasks being frozen - * or not, as reported by the notification, remains unchanged *throughout the - * duration* of the execution of the callbacks. - * Hence we need to prevent the freezer from racing with regular CPU hotplug. - * - * This synchronization is implemented by mutually excluding regular CPU - * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ - * Hibernate notifications. - */ -static int -cpu_hotplug_pm_callback(struct notifier_block *nb, - unsigned long action, void *ptr) -{ - switch (action) { - - case PM_SUSPEND_PREPARE: - case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); - break; - - case PM_POST_SUSPEND: - case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - - -static int __init cpu_hotplug_pm_sync_init(void) -{ - pm_notifier(cpu_hotplug_pm_callback, 0); - return 0; -} -core_initcall(cpu_hotplug_pm_sync_init); - -#endif /* CONFIG_PM_SLEEP_SMP */ - -/** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started - * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). 
- */ -void __cpuinit notify_cpu_starting(unsigned int cpu) -{ - unsigned long val = CPU_STARTING; - -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); -} - -#endif /* CONFIG_SMP */ - -/* - * cpu_bit_bitmap[] is a special, "compressed" data structure that - * represents all NR_CPUS bits binary values of 1< 32 - MASK_DECLARE_8(32), MASK_DECLARE_8(40), - MASK_DECLARE_8(48), MASK_DECLARE_8(56), -#endif -}; -EXPORT_SYMBOL_GPL(cpu_bit_bitmap); - -const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; -EXPORT_SYMBOL(cpu_all_bits); - -#ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; -#else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; -#endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); -EXPORT_SYMBOL(cpu_possible_mask); - -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); -EXPORT_SYMBOL(cpu_online_mask); - -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); -EXPORT_SYMBOL(cpu_present_mask); - -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); -EXPORT_SYMBOL(cpu_active_mask); - -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); -} - -void set_cpu_online(unsigned int cpu, bool 
online) -{ - if (online) - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); -} - -void init_cpu_present(const struct cpumask *src) -{ - cpumask_copy(to_cpumask(cpu_present_bits), src); -} - -void init_cpu_possible(const struct cpumask *src) -{ - cpumask_copy(to_cpumask(cpu_possible_bits), src); -} - -void init_cpu_online(const struct cpumask *src) -{ - cpumask_copy(to_cpumask(cpu_online_bits), src); -} -/* - * Copyright (C) 2011 Google, Inc. - * - * Author: - * Colin Cross - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include - -static DEFINE_RWLOCK(cpu_pm_notifier_lock); -static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); - -static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) -{ - int ret; - - ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, - nr_to_call, nr_calls); - - return notifier_to_errno(ret); -} - -/** - * cpu_pm_register_notifier - register a driver with cpu_pm - * @nb: notifier block to register - * - * Add a driver to a list of drivers that are notified about - * CPU and CPU cluster low power entry and exit. - * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_register. 
- */ -int cpu_pm_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - write_lock_irqsave(&cpu_pm_notifier_lock, flags); - ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb); - write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); - -/** - * cpu_pm_unregister_notifier - unregister a driver with cpu_pm - * @nb: notifier block to be unregistered - * - * Remove a driver from the CPU PM notifier list. - * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_unregister. - */ -int cpu_pm_unregister_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - write_lock_irqsave(&cpu_pm_notifier_lock, flags); - ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); - write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); - -/** - * cpm_pm_enter - CPU low power entry notifier - * - * Notifies listeners that a single CPU is entering a low power state that may - * cause some blocks in the same power domain as the cpu to reset. - * - * Must be called on the affected CPU with interrupts disabled. Platform is - * responsible for ensuring that cpu_pm_enter is not called twice on the same - * CPU before cpu_pm_exit is called. Notified drivers can include VFP - * co-processor, interrupt controller and it's PM extensions, local CPU - * timers context save/restore which shouldn't be interrupted. Hence it - * must be called with interrupts disabled. - * - * Return conditions are same as __raw_notifier_call_chain. - */ -int cpu_pm_enter(void) -{ - int nr_calls; - int ret = 0; - - read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU PM - * PM entry who are notified earlier to prepare for it. 
- */ - cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); - read_unlock(&cpu_pm_notifier_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(cpu_pm_enter); - -/** - * cpm_pm_exit - CPU low power exit notifier - * - * Notifies listeners that a single CPU is exiting a low power state that may - * have caused some blocks in the same power domain as the cpu to reset. - * - * Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which - * shouldn't be interrupted. Hence it must be called with interrupts disabled. - * - * Return conditions are same as __raw_notifier_call_chain. - */ -int cpu_pm_exit(void) -{ - int ret; - - read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); - read_unlock(&cpu_pm_notifier_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(cpu_pm_exit); - -/** - * cpm_cluster_pm_enter - CPU cluster low power entry notifier - * - * Notifies listeners that all cpus in a power domain are entering a low power - * state that may cause some blocks in the same power domain to reset. - * - * Must be called after cpu_pm_enter has been called on all cpus in the power - * domain, and before cpu_pm_exit has been called on any cpu in the power - * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which - * shouldn't be interrupted. Hence it must be called with interrupts disabled. - * - * Must be called with interrupts disabled. - * - * Return conditions are same as __raw_notifier_call_chain. - */ -int cpu_cluster_pm_enter(void) -{ - int nr_calls; - int ret = 0; - - read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); - if (ret) - /* - * Inform listeners (nr_calls - 1) about failure of CPU cluster - * PM entry who are notified earlier to prepare for it. 
- */ - cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); - read_unlock(&cpu_pm_notifier_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); - -/** - * cpm_cluster_pm_exit - CPU cluster low power exit notifier - * - * Notifies listeners that all cpus in a power domain are exiting form a - * low power state that may have caused some blocks in the same power domain - * to reset. - * - * Must be called after cpu_pm_exit has been called on all cpus in the power - * domain, and before cpu_pm_exit has been called on any cpu in the power - * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which - * shouldn't be interrupted. Hence it must be called with interrupts disabled. - * - * Return conditions are same as __raw_notifier_call_chain. - */ -int cpu_cluster_pm_exit(void) -{ - int ret; - - read_lock(&cpu_pm_notifier_lock); - ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); - read_unlock(&cpu_pm_notifier_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); - -#ifdef CONFIG_PM -static int cpu_pm_suspend(void) -{ - int ret; - - ret = cpu_pm_enter(); - if (ret) - return ret; - - ret = cpu_cluster_pm_enter(); - return ret; -} - -static void cpu_pm_resume(void) -{ - cpu_cluster_pm_exit(); - cpu_pm_exit(); -} - -static struct syscore_ops cpu_pm_syscore_ops = { - .suspend = cpu_pm_suspend, - .resume = cpu_pm_resume, -}; - -static int cpu_pm_init(void) -{ - register_syscore_ops(&cpu_pm_syscore_ops); - return 0; -} -core_initcall(cpu_pm_init); -#endif -/* - * kernel/cpuset.c - * - * Processor and Memory placement constraints for sets of tasks. - * - * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004-2007 Silicon Graphics, Inc. - * Copyright (C) 2006 Google, Inc - * - * Portions derived from Patrick Mochel's sysfs code. - * sysfs is Copyright (c) 2001-3 Patrick Mochel - * - * 2003-10-10 Written by Simon Derr. 
- * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson. - * 2006 Rework by Paul Menage to use generic cgroups - * 2008 Rework of the scheduler domains and CPU hotplug handling - * by Max Krasnyansky - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * Workqueue for cpuset related tasks. - * - * Using kevent workqueue may cause deadlock when memory_migrate - * is set. So we create a separate workqueue thread for cpuset. - */ -static struct workqueue_struct *cpuset_wq; - -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; - -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; -struct cpuset; - -/* See "Frequency meter" comments, below. 
*/ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - - struct cpuset *parent; /* my parent */ - - struct fmeter fmeter; /* memory_pressure filter */ - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; - - /* used for walking a cpuset hierarchy */ - struct list_head stack_list; -}; - -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cont) -{ - return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), - struct cpuset, css); -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return container_of(task_subsys_state(task, cpuset_subsys_id), - struct cpuset, css); -} - -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - -/* bits in struct cpuset flags field */ -typedef enum { - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} 
- -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - -static struct cpuset top_cpuset = { - .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), -}; - -/* - * There are two global mutexes guarding cpuset structures. The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific - * callback_mutex, below. They can nest. It is ok to first take - * cgroup_mutex, then nest callback_mutex. We also require taking - * task_lock() when dereferencing a task's cpuset pointer. See "The - * task_lock() exception", at the end of this comment. - * - * A task must hold both mutexes to modify cpusets. If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets. It can perform various checks on - * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_mutex, as that would risk double tripping on callback_mutex - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_mutex, then it has read-only - * access to cpusets. 
- * - * Now, the task_struct fields mems_allowed and mempolicy may be changed - * by other task, we use alloc_lock in the task_struct fields to protect - * them. - * - * The cpuset_common_file_read() handlers only hold callback_mutex across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c - */ - -static DEFINE_MUTEX(callback_mutex); - -/* - * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist - * buffers. They are statically allocated to prevent using excess stack - * when calling cpuset_print_task_mems_allowed(). - */ -#define CPUSET_NAME_LEN (128) -#define CPUSET_NODELIST_LEN (256) -static char cpuset_name[CPUSET_NAME_LEN]; -static char cpuset_nodelist[CPUSET_NODELIST_LEN]; -static DEFINE_SPINLOCK(cpuset_buffer_lock); - -/* - * This is ugly, but preserves the userspace API for existing cpuset - * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead - */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); - } - return ret; -} - -static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, -}; - -/* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_map. 
Or if passed a NULL cs from an exit'ing - * task, return cpu_online_map. - * - * One way or another, we guarantee to return some non-empty subset - * of cpu_online_map. - * - * Call with callback_mutex held. - */ - -static void guarantee_online_cpus(const struct cpuset *cs, - struct cpumask *pmask) -{ - while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) - cs = cs->parent; - if (cs) - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); - else - cpumask_copy(pmask, cpu_online_mask); - BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); -} - -/* - * Return in *pmask the portion of a cpusets's mems_allowed that - * are online, with memory. If none are online with memory, walk - * up the cpuset hierarchy until we find one that does have some - * online mems. If we get all the way to the top and still haven't - * found any online mems, return node_states[N_HIGH_MEMORY]. - * - * One way or another, we guarantee to return some non-empty subset - * of node_states[N_HIGH_MEMORY]. - * - * Call with callback_mutex held. - */ - -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) -{ - while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_HIGH_MEMORY])) - cs = cs->parent; - if (cs) - nodes_and(*pmask, cs->mems_allowed, - node_states[N_HIGH_MEMORY]); - else - *pmask = node_states[N_HIGH_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); -} - -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Called with callback_mutex/cgroup_mutex held - */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, - struct task_struct *tsk) -{ - if (is_spread_page(cs)) - tsk->flags |= PF_SPREAD_PAGE; - else - tsk->flags &= ~PF_SPREAD_PAGE; - if (is_spread_slab(cs)) - tsk->flags |= PF_SPREAD_SLAB; - else - tsk->flags &= ~PF_SPREAD_SLAB; -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? 
- * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. - */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - -/** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates - */ -static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) -{ - struct cpuset *trial; - - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); - if (!trial) - return NULL; - - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { - kfree(trial); - return NULL; - } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - - return trial; -} - -/** - * free_trial_cpuset - free the trial cpuset - * @trial: the trial cpuset to be freed - */ -static void free_trial_cpuset(struct cpuset *trial) -{ - free_cpumask_var(trial->cpus_allowed); - kfree(trial); -} - -/* - * validate_change() - Used to validate that any proposed cpuset change - * follows the structural rules for cpusets. - * - * If we replaced the flag and mask values of the current cpuset - * (cur) with those values in the trial cpuset (trial), would - * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. - * - * 'cur' is the address of an actual, in-use cpuset. Operations - * such as list traversal that depend on the actual address of the - * cpuset in the list must use cur below, not trial. - * - * 'trial' is the address of bulk structure copy of cur, with - * perhaps one or more of the fields cpus_allowed, mems_allowed, - * or flags changed to new, trial values. - * - * Return 0 if valid, -errno if not. 
- */ - -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) -{ - struct cgroup *cont; - struct cpuset *c, *par; - - /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } - - /* Remaining checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; - - par = cur->parent; - - /* We must be a subset of our parent cpuset */ - if (!is_cpuset_subset(trial, par)) - return -EACCES; - - /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; - } - - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } - - return 0; -} - -#ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? 
- */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) -{ - LIST_HEAD(q); - - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } -} - -/* - * generate_sched_domains() - * - * This function builds a partial partition of the systems CPUs - * A 'partial partition' is a set of non-overlapping subsets whose - * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched.c - * partition_sched_domains() routine, which will rebuild the scheduler's - * load balancing domains (sched domains) as specified by that partial - * partition. - * - * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt - * for a background explanation of this. - * - * Does not return errors, on the theory that the callers of this - * routine would rather not worry about failures to rebuild sched - * domains when operating in the severe memory shortage situations - * that could cause allocation failures below. - * - * Must be called with cgroup_lock held. - * - * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. 
This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. - * csa - (for CpuSet Array) Array of pointers to all the cpusets - * that need to be load balanced, for convenient iterative - * access by the subsequent code that finds the best partition, - * i.e the set of domains (subsets) of CPUs such that the - * cpus_allowed of every cpuset marked is_sched_load_balance - * is a subset of one of these domains, while there are as - * many such domains as possible, each as small as possible. - * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched.c routine partition_sched_domains() in a - * convenient format, that can be easily compared to the prior - * value to determine what partition elements (sched domains) - * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. - * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). 
- */ -static int generate_sched_domains(cpumask_var_t **domains, - struct sched_domain_attr **attributes) -{ - LIST_HEAD(q); /* queue of cpusets to be scanned */ - struct cpuset *cp; /* scans q */ - struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ - cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ - struct sched_domain_attr *dattr; /* attributes for custom domains */ - int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ - - doms = NULL; - dattr = NULL; - csa = NULL; - - /* Special case for the 99% of systems with one, full, sched domain */ - if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_copy(doms[0], top_cpuset.cpus_allowed); - - goto done; - } - - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); - if (!csa) - goto done; - csn = 0; - - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - - /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. 
- */ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; - continue; - } - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } - - for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; - -restart: - /* Find the best partition (set of sched domains) */ - for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; - } - } - } - - /* - * Now we know how many domains to create. - * Convert to and populate cpu masks. - */ - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - /* - * The rest of the code, including the scheduler, can deal with - * dattr==NULL case. No need to abort if alloc fails. - */ - dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); - - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; - } - - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->cpus_allowed); - if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; - } - } - nslot++; - } - BUG_ON(nslot != ndoms); - -done: - kfree(csa); - - /* - * Fallback to the default domain if kmalloc() failed. 
- * See comments in partition_sched_domains(). - */ - if (doms == NULL) - ndoms = 1; - - *domains = doms; - *attributes = dattr; - return ndoms; -} - -/* - * Rebuild scheduler domains. - * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). - * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. - */ -static void do_rebuild_sched_domains(struct work_struct *unused) -{ - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - get_online_cpus(); - - /* Generate domain masks and attrs */ - cgroup_lock(); - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); - - put_online_cpus(); -} -#else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) -{ -} - -static int generate_sched_domains(cpumask_var_t **domains, - struct sched_domain_attr **attributes) -{ - *domains = NULL; - return 1; -} -#endif /* CONFIG_SMP */ - -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. 
- * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. - */ -static void async_rebuild_sched_domains(void) -{ - queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */ -void rebuild_sched_domains(void) -{ - do_rebuild_sched_domains(NULL); -} - -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Call with cgroup_mutex held. May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). - */ -static int cpuset_test_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - return !cpumask_equal(&tsk->cpus_allowed, - (cgroup_cs(scan->cg))->cpus_allowed); -} - -/** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task - * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. 
- */ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); -} - -/** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. - * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() - * - * Called with cgroup_mutex held - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. - */ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) -{ - struct cgroup_scanner scan; - - scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); -} - -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) -{ - struct ptr_heap heap; - int retval; - int is_load_balanced; - - /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; - - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. 
- */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) - return -EINVAL; - } - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; - - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; - - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - - is_load_balanced = is_sched_load_balance(trialcs); - - mutex_lock(&callback_mutex); - cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - mutex_unlock(&callback_mutex); - - /* - * Scan tasks in the cpuset, and update the cpumasks of any - * that need an update. - */ - update_tasks_cpumask(cs, &heap); - - heap_free(&heap); - - if (is_load_balanced) - async_rebuild_sched_domains(); - return 0; -} - -/* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * Call holding cgroup_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. 
- */ - -static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, - const nodemask_t *to) -{ - struct task_struct *tsk = current; - - tsk->mems_allowed = *to; - - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - - guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); -} - -/* - * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy - * @tsk: the task to change - * @newmems: new nodes that the task will be set - * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. - */ -static void cpuset_change_task_nodemask(struct task_struct *tsk, - nodemask_t *newmems) -{ - bool need_loop; - -repeat: - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return; - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return; - - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * get_mems_allowed(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. 
- * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); - - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); - tsk->mems_allowed = *newmems; - task_unlock(tsk); -} - -/* - * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy - * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. - */ -static void cpuset_change_nodemask(struct task_struct *p, - struct cgroup_scanner *scan) -{ - struct mm_struct *mm; - struct cpuset *cs; - int migrate; - const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cgroup_mutex */ - - cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); - - cpuset_change_task_nodemask(p, &newmems); - - mm = get_task_mm(p); - if (!mm) - return; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); - mmput(mm); -} - -static void *cpuset_being_rebound; - -/** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 
- * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @oldmem: old mems_allowed of cpuset cs - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() - * - * Called with cgroup_mutex held - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. - */ -static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, - struct ptr_heap *heap) -{ - struct cgroup_scanner scan; - - cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_nodemask; - scan.heap = heap; - scan.data = (nodemask_t *)oldmem; - - /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't - * take while holding tasklist_lock. Forks can happen - the - * mpol_dup() cpuset_being_rebound check will catch such forks, - * and rebind their vma mempolicies too. Because we still hold - * the global cgroup_mutex, we know that no other rebind effort - * will be contending for the global variable cpuset_being_rebound. - * It's ok if we rebind the same mm twice; mpol_rebind_mm() - * is idempotent. Also migrate pages in each mm to new nodes. - */ - cgroup_scan_tasks(&scan); - - /* We're done rebinding vmas to this cpuset's new mems_allowed. */ - cpuset_being_rebound = NULL; -} - -/* - * Handle user request to change the 'mems' memory placement - * of a cpuset. Needs to validate the request, update the - * cpusets mems_allowed, and for each task in the cpuset, - * update mems_allowed and rebind task's mempolicy and any vma - * mempolicies and if the cpuset is marked 'memory_migrate', - * migrate the tasks pages to the new memory. - * - * Call with cgroup_mutex held. May take callback_mutex during call. - * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind - * their mempolicies to the cpusets new mems_allowed. 
- */ -static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) -{ - NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); - int retval; - struct ptr_heap heap; - - if (!oldmem) - return -ENOMEM; - - /* - * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* - * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. - */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; - - if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) { - retval = -EINVAL; - goto done; - } - } - *oldmem = cs->mems_allowed; - if (nodes_equal(*oldmem, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } - retval = validate_change(cs, trialcs); - if (retval < 0) - goto done; - - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval < 0) - goto done; - - mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs->mems_allowed; - mutex_unlock(&callback_mutex); - - update_tasks_nodemask(cs, oldmem, &heap); - - heap_free(&heap); -done: - NODEMASK_FREE(oldmem); - return retval; -} - -int current_cpuset_is_being_rebound(void) -{ - return task_cs(current) == cpuset_being_rebound; -} - -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - async_rebuild_sched_domains(); - } - - return 0; -} - -/* - * cpuset_change_flag - make a task's spread flags the same as its cpuset's - * @tsk: task to be 
updated - * @scan: struct cgroup_scanner containing the cgroup of the task - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. - */ -static void cpuset_change_flag(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); -} - -/* - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() - * - * Called with cgroup_mutex held - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. - */ -static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) -{ - struct cgroup_scanner scan; - - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_flag; - scan.heap = heap; - cgroup_scan_tasks(&scan); -} - -/* - * update_flag - read a 0 or a 1 in a file and update associated flag - * bit: the bit to update (see cpuset_flagbits_t) - * cs: the cpuset to update - * turning_on: whether the flag is being set or cleared - * - * Call with cgroup_mutex held. 
- */ - -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on) -{ - struct cpuset *trialcs; - int balance_flag_changed; - int spread_flag_changed; - struct ptr_heap heap; - int err; - - trialcs = alloc_trial_cpuset(cs); - if (!trialcs) - return -ENOMEM; - - if (turning_on) - set_bit(bit, &trialcs->flags); - else - clear_bit(bit, &trialcs->flags); - - err = validate_change(cs, trialcs); - if (err < 0) - goto out; - - err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (err < 0) - goto out; - - balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(trialcs)); - - spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) - || (is_spread_page(cs) != is_spread_page(trialcs))); - - mutex_lock(&callback_mutex); - cs->flags = trialcs->flags; - mutex_unlock(&callback_mutex); - - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - async_rebuild_sched_domains(); - - if (spread_flag_changed) - update_tasks_flags(cs, &heap); - heap_free(&heap); -out: - free_trial_cpuset(trialcs); - return err; -} - -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). 
- * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. 
- */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time_t now = get_seconds(); - time_t ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. 
- */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) -{ - struct cpuset *cs = cgroup_cs(cgrp); - struct task_struct *task; - int ret; - - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; - - cgroup_taskset_for_each(task, cgrp, tset) { - /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. - */ - if (task->flags & PF_THREAD_BOUND) - return -EINVAL; - if ((ret = security_task_setscheduler(task))) - return ret; - } - - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - - return 0; -} - -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) -{ - struct mm_struct *mm; - struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *oldcs = cgroup_cs(oldcgrp); - - cgroup_taskset_for_each(task, cgrp, tset) { - /* - * can_attach beforehand should guarantee that this doesn't - * fail. 
TODO: have a better way to handle failure here - */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); - - cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); - } - - /* - * Change mm, possibly for multiple threads in a threadgroup. This is - * expensive and may sleep. - */ - cpuset_attach_nodemask_from = oldcs->mems_allowed; - cpuset_attach_nodemask_to = cs->mems_allowed; - mm = get_task_mm(leader); - if (mm) { - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, - &cpuset_attach_nodemask_to); - mmput(mm); - } -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) -{ - int retval = 0; - struct cpuset *cs = cgroup_cs(cgrp); - cpuset_filetype_t type = cft->private; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_MEMORY_PRESSURE: - retval = -EACCES; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case 
FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } - cgroup_unlock(); - return retval; -} - -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) -{ - int retval = 0; - struct cpuset *cs = cgroup_cs(cgrp); - cpuset_filetype_t type = cft->private; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } - cgroup_unlock(); - return retval; -} - -/* - * Common handling for a write to a "cpus" or "mems" file. - */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) -{ - int retval = 0; - struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *trialcs; - - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - - trialcs = alloc_trial_cpuset(cs); - if (!trialcs) { - retval = -ENOMEM; - goto out; - } - - switch (cft->private) { - case FILE_CPULIST: - retval = update_cpumask(cs, trialcs, buf); - break; - case FILE_MEMLIST: - retval = update_nodemask(cs, trialcs, buf); - break; - default: - retval = -EINVAL; - break; - } - - free_trial_cpuset(trialcs); -out: - cgroup_unlock(); - return retval; -} - -/* - * These ascii lists should be read in a single call, by using a user - * buffer large enough to hold the entire map. If read in smaller - * chunks, there is no guarantee of atomicity. Since the display format - * used, list of ranges of sequential numbers, is variable length, - * and since these maps can change value dynamically, one could read - * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. 
- */ - -static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) -{ - size_t count; - - mutex_lock(&callback_mutex); - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - size_t count; - - mutex_lock(&callback_mutex); - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static ssize_t cpuset_common_file_read(struct cgroup *cont, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - s = page; - - switch (type) { - case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); - break; - case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); - break; - default: - retval = -EINVAL; - goto out; - } - *s++ = '\n'; - - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; -} - -static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc 
happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unrechable but makes gcc happy */ - return 0; -} - - -/* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype files[] = { - { - .name = "cpus", - .read = cpuset_common_file_read, - .write_string = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .read = cpuset_common_file_read, - .write_string = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE, - .mode = S_IRUGO, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - .name = "memory_spread_slab", - .read_u64 = 
cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, -}; - -static struct cftype cft_memory_pressure_enabled = { - .name = "memory_pressure_enabled", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, -}; - -static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - int err; - - err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); - if (err) - return err; - /* memory_pressure_enabled is in root cpuset only */ - if (!cont->parent) - err = cgroup_add_file(cont, ss, - &cft_memory_pressure_enabled); - return err; -} - -/* - * post_clone() is called during cgroup_create() when the - * clone_children mount argument was specified. The cgroup - * can not yet have any tasks. - * - * Currently we refuse to set up the cgroup - thereby - * refusing the task to be entered, and as a result refusing - * the sys_unshare() or clone() which initiated it - if any - * sibling cpusets have exclusive cpus or mem. - * - * If this becomes a problem for some users who wish to - * allow that scenario, then cpuset_post_clone() could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. Called with cgroup_mutex - * held. 
- */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct cgroup *parent, *child; - struct cpuset *cs, *parent_cs; - - parent = cgroup->parent; - list_for_each_entry(child, &parent->children, sibling) { - cs = cgroup_cs(child); - if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) - return; - } - cs = cgroup_cs(cgroup); - parent_cs = cgroup_cs(parent); - - mutex_lock(&callback_mutex); - cs->mems_allowed = parent_cs->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); - mutex_unlock(&callback_mutex); - return; -} - -/* - * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem - * cont: control group that the new cpuset will be part of - */ - -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cpuset *cs; - struct cpuset *parent; - - if (!cont->parent) { - return &top_cpuset.css; - } - parent = cgroup_cs(cont->parent); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) - return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpumask_clear(cs->cpus_allowed); - nodes_clear(cs->mems_allowed); - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; - - cs->parent = parent; - number_of_cpusets++; - return &cs->css ; -} - -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). 
- */ - -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - struct cpuset *cs = cgroup_cs(cont); - - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; - free_cpumask_var(cs->cpus_allowed); - kfree(cs); -} - -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", - .create = cpuset_create, - .destroy = cpuset_destroy, - .can_attach = cpuset_can_attach, - .attach = cpuset_attach, - .populate = cpuset_populate, - .post_clone = cpuset_post_clone, - .subsys_id = cpuset_subsys_id, - .early_init = 1, -}; - -/** - * cpuset_init - initialize cpusets at system boot - * - * Description: Initialize top_cpuset and the cpuset internal file system, - **/ - -int __init cpuset_init(void) -{ - int err = 0; - - if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) - BUG(); - - cpumask_setall(top_cpuset.cpus_allowed); - nodes_setall(top_cpuset.mems_allowed); - - fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; - - err = register_filesystem(&cpuset_fs_type); - if (err < 0) - return err; - - if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) - BUG(); - - number_of_cpusets = 1; - return 0; -} - -/** - * cpuset_do_move_task - move a given task to another cpuset - * @tsk: pointer to task_struct the task to move - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * Return nonzero to stop the walk through the tasks. 
- */ -static void cpuset_do_move_task(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - struct cgroup *new_cgroup = scan->data; - - cgroup_attach_task(new_cgroup, tsk); -} - -/** - * move_member_tasks_to_cpuset - move tasks from one cpuset to another - * @from: cpuset in which the tasks currently reside - * @to: cpuset to which the tasks will be moved - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - */ -static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) -{ - struct cgroup_scanner scan; - - scan.cg = from->css.cgroup; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cpuset_do_move_task; - scan.heap = NULL; - scan.data = to->css.cgroup; - - if (cgroup_scan_tasks(&scan)) - printk(KERN_ERR "move_member_tasks_to_cpuset: " - "cgroup_scan_tasks failed\n"); -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). 
- */ - parent = cs->parent; - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent->parent; - - move_member_tasks_to_cpuset(cs, parent); -} - -/* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. - * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. - * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. - * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. - */ -static void scan_for_empty_cpusets(struct cpuset *root) -{ - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - list_add_tail((struct list_head *)&root->stack_list, &queue); - - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); - } - - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; - - oldmems = cp->mems_allowed; - - /* Remove offline cpus and mems from this cpuset. 
*/ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); - - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); - } - } -} - -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). - */ -void cpuset_update_active_cpus(void) -{ - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - cgroup_lock(); - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). 
- */ -static int cpuset_track_online_nodes(struct notifier_block *self, - unsigned long action, void *arg) -{ - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - cgroup_lock(); - switch (action) { - case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); - break; - case MEM_OFFLINE: - /* - * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. - */ - scan_for_empty_cpusets(&top_cpuset); - break; - default: - break; - } - cgroup_unlock(); - - return NOTIFY_OK; -} -#endif - -/** - * cpuset_init_smp - initialize cpus_allowed - * - * Description: Finish top cpuset after cpu, node maps are initialized - **/ - -void __init cpuset_init_smp(void) -{ - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - - hotplug_memory_notifier(cpuset_track_online_nodes, 10); - - cpuset_wq = create_singlethread_workqueue("cpuset"); - BUG_ON(!cpuset_wq); -} - -/** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. - * - * Description: Returns the cpumask_var_t cpus_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_map, even if this means going outside the - * tasks cpuset. 
- **/ - -void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) -{ - mutex_lock(&callback_mutex); - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); - task_unlock(tsk); - mutex_unlock(&callback_mutex); -} - -int cpuset_cpus_allowed_fallback(struct task_struct *tsk) -{ - const struct cpuset *cs; - int cpu; - - rcu_read_lock(); - cs = task_cs(tsk); - if (cs) - do_set_cpus_allowed(tsk, cs->cpus_allowed); - rcu_read_unlock(); - - /* - * We own tsk->cpus_allowed, nobody can change it under us. - * - * But we used cs && cs->cpus_allowed lockless and thus can - * race with cgroup_attach_task() or update_cpumask() and get - * the wrong tsk->cpus_allowed. However, both cases imply the - * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() - * which takes task_rq_lock(). - * - * If we are called after it dropped the lock we must see all - * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary - * set any mask even if it is not right from task_cs() pov, - * the pending set_cpus_allowed_ptr() will fix things. - */ - - cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); - if (cpu >= nr_cpu_ids) { - /* - * Either tsk->cpus_allowed is wrong (see above) or it - * is actually empty. The latter case is only possible - * if we are racing with remove_tasks_in_empty_cpuset(). - * Like above we can temporary set any mask and rely on - * set_cpus_allowed_ptr() as synchronization point. - */ - do_set_cpus_allowed(tsk, cpu_possible_mask); - cpu = cpumask_any(cpu_active_mask); - } - - return cpu; -} - -void cpuset_init_current_mems_allowed(void) -{ - nodes_setall(current->mems_allowed); -} - -/** - * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. - * - * Description: Returns the nodemask_t mems_allowed of the cpuset - * attached to the specified @tsk. 
Guaranteed to return some non-empty - * subset of node_states[N_HIGH_MEMORY], even if this means going outside the - * tasks cpuset. - **/ - -nodemask_t cpuset_mems_allowed(struct task_struct *tsk) -{ - nodemask_t mask; - - mutex_lock(&callback_mutex); - task_lock(tsk); - guarantee_online_mems(task_cs(tsk), &mask); - task_unlock(tsk); - mutex_unlock(&callback_mutex); - - return mask; -} - -/** - * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed - * @nodemask: the nodemask to be checked - * - * Are any of the nodes in the nodemask allowed in current->mems_allowed? - */ -int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) -{ - return nodes_intersects(*nodemask, current->mems_allowed); -} - -/* - * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or - * mem_hardwall ancestor to the specified cpuset. Call holding - * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall - * (an unusual configuration), then returns the root cpuset. - */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) -{ - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; - return cs; -} - -/** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest - * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE - * flag, yes. - * Otherwise, no. - * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. 
- * - * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall - * cpusets, and never sleeps. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * GFP_USER allocations are marked with the __GFP_HARDWALL bit, - * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. - * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest enclosing hardwalled ancestor cpuset. - * - * Scanning up parent cpusets requires callback_mutex. The - * __alloc_pages() routine only calls here with __GFP_HARDWALL bit - * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the - * current tasks mems_allowed came up empty on the first pass over - * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_mutex - * mutex. - * - * The first call here from mm/page_alloc:get_page_from_freelist() - * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, - * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). - * - * The second pass through get_page_from_freelist() doesn't even call - * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() - * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set - * in alloc_flags. That logic and the checks below have the combined - * affect that: - * in_interrupt - any node ok (current task context irrelevant) - * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok - * GFP_KERNEL - any node in enclosing hardwalled cpuset ok - * GFP_USER - only nodes in current tasks mems allowed ok. 
- * - * Rule: - * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you - * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables - * the code that might scan up ancestor cpusets and sleep. - */ -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) -{ - const struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ - - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; - - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; - - /* Not hardwall and node outside mems_allowed: scan up cpusets */ - mutex_lock(&callback_mutex); - - task_lock(current); - cs = nearest_hardwall_ancestor(task_cs(current)); - task_unlock(current); - - allowed = node_isset(node, cs->mems_allowed); - mutex_unlock(&callback_mutex); - return allowed; -} - -/* - * cpuset_node_allowed_hardwall - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If the task has been OOM killed and has access to memory reserves as - * specified by the TIF_MEMDIE flag, yes. - * Otherwise, no. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. 
By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * Unlike the cpuset_node_allowed_softwall() variant, above, - * this variant requires that the node be in the current task's - * mems_allowed or that we're in interrupt. It does not scan up the - * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. - * It never sleeps. - */ -int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - return 0; -} - -/** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - -/** - * cpuset_mem_spread_node() - On which node to begin search for a file page - * cpuset_slab_spread_node() - On which node to begin search for a slab page - * - * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for - * tasks in a cpuset with is_spread_page or is_spread_slab set), - * and if the memory allocation used cpuset_mem_spread_node() - * to determine on which node to start looking, as it will for - * certain page cache or slab cache pages such as used for file - * system buffers and inode caches, then instead of starting on the - * local node to look for a free page, rather spread the starting - * node around the tasks mems_allowed nodes. - * - * We don't have to worry about the returned node being offline - * because "it can't happen", and even if it did, it would be ok. - * - * The routines calling guarantee_online_mems() are careful to - * only set nodes in task->mems_allowed that are online. So it - * should not be possible for the following code to return an - * offline node. 
But if it did, that would be ok, as this routine - * is not returning the node where the allocation must be, only - * the node where the search should start. The zonelist passed to - * __alloc_pages() will include all nodes. If the slab allocator - * is passed an offline node, it will fall back to the local node. - * See kmem_cache_alloc_node(). - */ - -static int cpuset_spread_node(int *rotor) -{ - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; -} - -int cpuset_mem_spread_node(void) -{ - if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) - current->cpuset_mem_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); -} - -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} - -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** - * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? - * @tsk1: pointer to task_struct of some task. - * @tsk2: pointer to task_struct of some other task. - * - * Description: Return true if @tsk1's mems_allowed intersects the - * mems_allowed of @tsk2. Used by the OOM killer to determine if - * one of the task's memory usage might impact the memory available - * to the other. - **/ - -int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, - const struct task_struct *tsk2) -{ - return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); -} - -/** - * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @task: pointer to task_struct of some task. - * - * Description: Prints @task's name, cpuset name, and cached copy of its - * mems_allowed to the kernel log. Must hold task_lock(task) to allow - * dereferencing task_cs(task). 
- */ -void cpuset_print_task_mems_allowed(struct task_struct *tsk) -{ - struct dentry *dentry; - - dentry = task_cs(tsk)->css.cgroup->dentry; - spin_lock(&cpuset_buffer_lock); - snprintf(cpuset_name, CPUSET_NAME_LEN, - dentry ? (const char *)dentry->d_name.name : "/"); - nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, - tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cpuset_name, cpuset_nodelist); - spin_unlock(&cpuset_buffer_lock); -} - -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - **/ - -void __cpuset_memory_pressure_bump(void) -{ - task_lock(current); - fmeter_markevent(&task_cs(current)->fmeter); - task_unlock(current); -} - -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc//cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it - * anyway. 
- */ -static int proc_cpuset_show(struct seq_file *m, void *unused_v) -{ - struct pid *pid; - struct task_struct *tsk; - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) - goto out; - - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = -EINVAL; - cgroup_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); - if (retval < 0) - goto out_unlock; - seq_puts(m, buf); - seq_putc(m, '\n'); -out_unlock: - cgroup_unlock(); - put_task_struct(tsk); -out_free: - kfree(buf); -out: - return retval; -} - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROC_PID_CPUSET */ - -/* Display task mems_allowed in /proc//status file. */ -void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) -{ - seq_printf(m, "Mems_allowed:\t"); - seq_nodemask(m, &task->mems_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed_list:\t"); - seq_nodemask_list(m, &task->mems_allowed); - seq_printf(m, "\n"); -} -#include -#include -#include -#include -#include - -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - -/* - * stores the physical address of elf header of crash image - * - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence put - * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 
- */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - -/* - * stores the size of elf header of crash image - */ -unsigned long long elfcorehdr_size; - -/* - * elfcorehdr= specifies the location of elf core header stored by the crashed - * kernel. This option will be passed by kexec loader to the capture kernel. - * - * Syntax: elfcorehdr=[size[KMG]@]offset[KMG] - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - if (*end == '@') { - elfcorehdr_size = elfcorehdr_addr; - elfcorehdr_addr = memparse(end + 1, &end); - } - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -/* Task credentials management - see Documentation/security/credentials.txt - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if 0 -#define kdebug(FMT, ...) \ - printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) -#else -#define kdebug(FMT, ...) 
\ - no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) -#endif - -static struct kmem_cache *cred_jar; - -/* - * The common credentials for the initial task's thread group - */ -#ifdef CONFIG_KEYS -static struct thread_group_cred init_tgcred = { - .usage = ATOMIC_INIT(2), - .tgid = 0, - .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), -}; -#endif - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), -#ifdef CONFIG_DEBUG_CREDENTIALS - .subscribers = ATOMIC_INIT(2), - .magic = CRED_MAGIC, -#endif - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, -#ifdef CONFIG_KEYS - .tgcred = &init_tgcred, -#endif -}; - -static inline void set_cred_subscribers(struct cred *cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - atomic_set(&cred->subscribers, n); -#endif -} - -static inline int read_cred_subscribers(const struct cred *cred) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - return atomic_read(&cred->subscribers); -#else - return 0; -#endif -} - -static inline void alter_cred_subscribers(const struct cred *_cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - struct cred *cred = (struct cred *) _cred; - - atomic_add(n, &cred->subscribers); -#endif -} - -/* - * Dispose of the shared task group credentials - */ -#ifdef CONFIG_KEYS -static void release_tgcred_rcu(struct rcu_head *rcu) -{ - struct thread_group_cred *tgcred = - container_of(rcu, struct thread_group_cred, rcu); - - BUG_ON(atomic_read(&tgcred->usage) != 0); - - key_put(tgcred->session_keyring); - key_put(tgcred->process_keyring); - kfree(tgcred); -} -#endif - -/* - * Release a set of thread group credentials. 
- */ -static void release_tgcred(struct cred *cred) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = cred->tgcred; - - if (atomic_dec_and_test(&tgcred->usage)) - call_rcu(&tgcred->rcu, release_tgcred_rcu); -#endif -} - -/* - * The RCU callback to actually dispose of a set of credentials - */ -static void put_cred_rcu(struct rcu_head *rcu) -{ - struct cred *cred = container_of(rcu, struct cred, rcu); - - kdebug("put_cred_rcu(%p)", cred); - -#ifdef CONFIG_DEBUG_CREDENTIALS - if (cred->magic != CRED_MAGIC_DEAD || - atomic_read(&cred->usage) != 0 || - read_cred_subscribers(cred) != 0) - panic("CRED: put_cred_rcu() sees %p with" - " mag %x, put %p, usage %d, subscr %d\n", - cred, cred->magic, cred->put_addr, - atomic_read(&cred->usage), - read_cred_subscribers(cred)); -#else - if (atomic_read(&cred->usage) != 0) - panic("CRED: put_cred_rcu() sees %p with usage %d\n", - cred, atomic_read(&cred->usage)); -#endif - - security_cred_free(cred); - key_put(cred->thread_keyring); - key_put(cred->request_key_auth); - release_tgcred(cred); - if (cred->group_info) - put_group_info(cred->group_info); - free_uid(cred->user); - kmem_cache_free(cred_jar, cred); -} - -/** - * __put_cred - Destroy a set of credentials - * @cred: The record to release - * - * Destroy a set of credentials on which no references remain. 
- */ -void __put_cred(struct cred *cred) -{ - kdebug("__put_cred(%p{%d,%d})", cred, - atomic_read(&cred->usage), - read_cred_subscribers(cred)); - - BUG_ON(atomic_read(&cred->usage) != 0); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(cred) != 0); - cred->magic = CRED_MAGIC_DEAD; - cred->put_addr = __builtin_return_address(0); -#endif - BUG_ON(cred == current->cred); - BUG_ON(cred == current->real_cred); - - call_rcu(&cred->rcu, put_cred_rcu); -} -EXPORT_SYMBOL(__put_cred); - -/* - * Clean up a task's credentials when it exits - */ -void exit_creds(struct task_struct *tsk) -{ - struct cred *cred; - - kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); - - cred = (struct cred *) tsk->real_cred; - tsk->real_cred = NULL; - validate_creds(cred); - alter_cred_subscribers(cred, -1); - put_cred(cred); - - cred = (struct cred *) tsk->cred; - tsk->cred = NULL; - validate_creds(cred); - alter_cred_subscribers(cred, -1); - put_cred(cred); - - cred = (struct cred *) tsk->replacement_session_keyring; - if (cred) { - tsk->replacement_session_keyring = NULL; - validate_creds(cred); - put_cred(cred); - } -} - -/** - * get_task_cred - Get another task's objective credentials - * @task: The task to query - * - * Get the objective credentials of a task, pinning them so that they can't go - * away. Accessing a task's credentials directly is not permitted. - * - * The caller must also make sure task doesn't get deleted, either by holding a - * ref on task or by holding tasklist_lock to prevent it from being unlinked. 
- */ -const struct cred *get_task_cred(struct task_struct *task) -{ - const struct cred *cred; - - rcu_read_lock(); - - do { - cred = __task_cred((task)); - BUG_ON(!cred); - } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); - - rcu_read_unlock(); - return cred; -} - -/* - * Allocate blank credentials, such that the credentials can be filled in at a - * later date without risk of ENOMEM. - */ -struct cred *cred_alloc_blank(void) -{ - struct cred *new; - - new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); - if (!new) - return NULL; - -#ifdef CONFIG_KEYS - new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); - if (!new->tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } - atomic_set(&new->tgcred->usage, 1); -#endif - - atomic_set(&new->usage, 1); -#ifdef CONFIG_DEBUG_CREDENTIALS - new->magic = CRED_MAGIC; -#endif - - if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) - goto error; - - return new; - -error: - abort_creds(new); - return NULL; -} - -/** - * prepare_creds - Prepare a new set of credentials for modification - * - * Prepare a new set of task credentials for modification. A task's creds - * shouldn't generally be modified directly, therefore this function is used to - * prepare a new copy, which the caller then modifies and then commits by - * calling commit_creds(). - * - * Preparation involves making a copy of the objective creds for modification. - * - * Returns a pointer to the new creds-to-be if successful, NULL otherwise. - * - * Call commit_creds() or abort_creds() to clean up. 
- */ -struct cred *prepare_creds(void) -{ - struct task_struct *task = current; - const struct cred *old; - struct cred *new; - - validate_process_creds(); - - new = kmem_cache_alloc(cred_jar, GFP_KERNEL); - if (!new) - return NULL; - - kdebug("prepare_creds() alloc %p", new); - - old = task->cred; - memcpy(new, old, sizeof(struct cred)); - - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); - get_group_info(new->group_info); - get_uid(new->user); - -#ifdef CONFIG_KEYS - key_get(new->thread_keyring); - key_get(new->request_key_auth); - atomic_inc(&new->tgcred->usage); -#endif - -#ifdef CONFIG_SECURITY - new->security = NULL; -#endif - - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) - goto error; - validate_creds(new); - return new; - -error: - abort_creds(new); - return NULL; -} -EXPORT_SYMBOL(prepare_creds); - -/* - * Prepare credentials for current to perform an execve() - * - The caller must hold ->cred_guard_mutex - */ -struct cred *prepare_exec_creds(void) -{ - struct thread_group_cred *tgcred = NULL; - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return NULL; -#endif - - new = prepare_creds(); - if (!new) { - kfree(tgcred); - return new; - } - -#ifdef CONFIG_KEYS - /* newly exec'd tasks don't get a thread keyring */ - key_put(new->thread_keyring); - new->thread_keyring = NULL; - - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; -#endif - - return new; -} - -/* - * Copy credentials for the new process created by fork() - * - * We share if we can, but under some circumstances we have to generate a new - * set. 
- * - * The new process gets the current process's subjective credentials as its - * objective and subjective credentials - */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif - struct cred *new; - int ret; - - if ( -#ifdef CONFIG_KEYS - !p->cred->thread_keyring && -#endif - clone_flags & CLONE_THREAD - ) { - p->real_cred = get_cred(p->cred); - get_cred(p->cred); - alter_cred_subscribers(p->cred, 2); - kdebug("share_creds(%p{%d,%d})", - p->cred, atomic_read(&p->cred->usage), - read_cred_subscribers(p->cred)); - atomic_inc(&p->cred->user->processes); - return 0; - } - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - if (clone_flags & CLONE_NEWUSER) { - ret = create_user_ns(new); - if (ret < 0) - goto error_put; - } - - /* cache user_ns in cred. Doesn't need a refcount because it will - * stay pinned by cred->user - */ - new->user_ns = new->user->user_ns; - -#ifdef CONFIG_KEYS - /* new threads get their own thread keyrings if their parent already - * had one */ - if (new->thread_keyring) { - key_put(new->thread_keyring); - new->thread_keyring = NULL; - if (clone_flags & CLONE_THREAD) - install_thread_keyring_to_cred(new); - } - - /* we share the process and session keyrings between all the threads in - * a process - this is slightly icky as we violate COW credentials a - * bit */ - if (!(clone_flags & CLONE_THREAD)) { - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - ret = -ENOMEM; - goto error_put; - } - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = key_get(new->tgcred->session_keyring); - - release_tgcred(new); - new->tgcred = tgcred; - } -#endif - - atomic_inc(&new->user->processes); - p->cred = p->real_cred = get_cred(new); - alter_cred_subscribers(new, 2); - validate_creds(new); - return 0; - -error_put: - put_cred(new); - return ret; -} - -/** - * commit_creds - Install 
new credentials upon the current task - * @new: The credentials to be assigned - * - * Install a new set of credentials to the current task, using RCU to replace - * the old set. Both the objective and the subjective credentials pointers are - * updated. This function may not be called if the subjective credentials are - * in an overridden state. - * - * This function eats the caller's reference to the new credentials. - * - * Always returns 0 thus allowing this function to be tail-called at the end - * of, say, sys_setgid(). - */ -int commit_creds(struct cred *new) -{ - struct task_struct *task = current; - const struct cred *old = task->real_cred; - - kdebug("commit_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); - - BUG_ON(task->cred != old); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(old) < 2); - validate_creds(old); - validate_creds(new); -#endif - BUG_ON(atomic_read(&new->usage) < 1); - - get_cred(new); /* we will require a ref for the subj creds too */ - - /* dumpability changes */ - if (old->euid != new->euid || - old->egid != new->egid || - old->fsuid != new->fsuid || - old->fsgid != new->fsgid || - !cap_issubset(new->cap_permitted, old->cap_permitted)) { - if (task->mm) - set_dumpable(task->mm, suid_dumpable); - task->pdeath_signal = 0; - smp_wmb(); - } - - /* alter the thread keyring */ - if (new->fsuid != old->fsuid) - key_fsuid_changed(task); - if (new->fsgid != old->fsgid) - key_fsgid_changed(task); - - /* do it - * RLIMIT_NPROC limits on user->processes have already been checked - * in set_user(). 
- */ - alter_cred_subscribers(new, 2); - if (new->user != old->user) - atomic_inc(&new->user->processes); - rcu_assign_pointer(task->real_cred, new); - rcu_assign_pointer(task->cred, new); - if (new->user != old->user) - atomic_dec(&old->user->processes); - alter_cred_subscribers(old, -2); - - /* send notifications */ - if (new->uid != old->uid || - new->euid != old->euid || - new->suid != old->suid || - new->fsuid != old->fsuid) - proc_id_connector(task, PROC_EVENT_UID); - - if (new->gid != old->gid || - new->egid != old->egid || - new->sgid != old->sgid || - new->fsgid != old->fsgid) - proc_id_connector(task, PROC_EVENT_GID); - - /* release the old obj and subj refs both */ - put_cred(old); - put_cred(old); - return 0; -} -EXPORT_SYMBOL(commit_creds); - -/** - * abort_creds - Discard a set of credentials and unlock the current task - * @new: The credentials that were going to be applied - * - * Discard a set of credentials that were under construction and unlock the - * current task. - */ -void abort_creds(struct cred *new) -{ - kdebug("abort_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); - -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(new) != 0); -#endif - BUG_ON(atomic_read(&new->usage) < 1); - put_cred(new); -} -EXPORT_SYMBOL(abort_creds); - -/** - * override_creds - Override the current process's subjective credentials - * @new: The credentials to be assigned - * - * Install a set of temporary override subjective credentials on the current - * process, returning the old set for later reversion. 
- */ -const struct cred *override_creds(const struct cred *new) -{ - const struct cred *old = current->cred; - - kdebug("override_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); - - validate_creds(old); - validate_creds(new); - get_cred(new); - alter_cred_subscribers(new, 1); - rcu_assign_pointer(current->cred, new); - alter_cred_subscribers(old, -1); - - kdebug("override_creds() = %p{%d,%d}", old, - atomic_read(&old->usage), - read_cred_subscribers(old)); - return old; -} -EXPORT_SYMBOL(override_creds); - -/** - * revert_creds - Revert a temporary subjective credentials override - * @old: The credentials to be restored - * - * Revert a temporary set of override subjective credentials to an old set, - * discarding the override set. - */ -void revert_creds(const struct cred *old) -{ - const struct cred *override = current->cred; - - kdebug("revert_creds(%p{%d,%d})", old, - atomic_read(&old->usage), - read_cred_subscribers(old)); - - validate_creds(old); - validate_creds(override); - alter_cred_subscribers(old, 1); - rcu_assign_pointer(current->cred, old); - alter_cred_subscribers(override, -1); - put_cred(override); -} -EXPORT_SYMBOL(revert_creds); - -/* - * initialise the credentials stuff - */ -void __init cred_init(void) -{ - /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} - -/** - * prepare_kernel_cred - Prepare a set of credentials for a kernel service - * @daemon: A userspace daemon to be used as a reference - * - * Prepare a set of credentials for a kernel service. This can then be used to - * override a task's own credentials so that work can be done on behalf of that - * task that requires a different subjective context. - * - * @daemon is used to provide a base for the security record, but can be NULL. 
- * If @daemon is supplied, then the security data will be derived from that; - * otherwise they'll be set to 0 and no groups, full capabilities and no keys. - * - * The caller may change these controls afterwards if desired. - * - * Returns the new credentials or NULL if out of memory. - * - * Does not take, and does not return holding current->cred_replace_mutex. - */ -struct cred *prepare_kernel_cred(struct task_struct *daemon) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif - const struct cred *old; - struct cred *new; - - new = kmem_cache_alloc(cred_jar, GFP_KERNEL); - if (!new) - return NULL; - -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } -#endif - - kdebug("prepare_kernel_cred() alloc %p", new); - - if (daemon) - old = get_task_cred(daemon); - else - old = get_cred(&init_cred); - - validate_creds(old); - - *new = *old; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); - get_uid(new->user); - get_group_info(new->group_info); - -#ifdef CONFIG_KEYS - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = NULL; - new->tgcred = tgcred; - new->request_key_auth = NULL; - new->thread_keyring = NULL; - new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; -#endif - -#ifdef CONFIG_SECURITY - new->security = NULL; -#endif - if (security_prepare_creds(new, old, GFP_KERNEL) < 0) - goto error; - - put_cred(old); - validate_creds(new); - return new; - -error: - put_cred(new); - put_cred(old); - return NULL; -} -EXPORT_SYMBOL(prepare_kernel_cred); - -/** - * set_security_override - Set the security ID in a set of credentials - * @new: The credentials to alter - * @secid: The LSM security ID to set - * - * Set the LSM security ID in a set of credentials so that the subjective - * security is overridden when an alternative set of credentials is used. 
- */ -int set_security_override(struct cred *new, u32 secid) -{ - return security_kernel_act_as(new, secid); -} -EXPORT_SYMBOL(set_security_override); - -/** - * set_security_override_from_ctx - Set the security ID in a set of credentials - * @new: The credentials to alter - * @secctx: The LSM security context to generate the security ID from. - * - * Set the LSM security ID in a set of credentials so that the subjective - * security is overridden when an alternative set of credentials is used. The - * security ID is specified in string form as a security context to be - * interpreted by the LSM. - */ -int set_security_override_from_ctx(struct cred *new, const char *secctx) -{ - u32 secid; - int ret; - - ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); - if (ret < 0) - return ret; - - return set_security_override(new, secid); -} -EXPORT_SYMBOL(set_security_override_from_ctx); - -/** - * set_create_files_as - Set the LSM file create context in a set of credentials - * @new: The credentials to alter - * @inode: The inode to take the context from - * - * Change the LSM file creation context in a set of credentials to be the same - * as the object context of the specified inode, so that the new inodes have - * the same MAC context as that inode. - */ -int set_create_files_as(struct cred *new, struct inode *inode) -{ - new->fsuid = inode->i_uid; - new->fsgid = inode->i_gid; - return security_kernel_create_files_as(new, inode); -} -EXPORT_SYMBOL(set_create_files_as); - -#ifdef CONFIG_DEBUG_CREDENTIALS - -bool creds_are_invalid(const struct cred *cred) -{ - if (cred->magic != CRED_MAGIC) - return true; -#ifdef CONFIG_SECURITY_SELINUX - /* - * cred->security == NULL if security_cred_alloc_blank() or - * security_prepare_creds() returned an error. 
- */ - if (selinux_is_enabled() && cred->security) { - if ((unsigned long) cred->security < PAGE_SIZE) - return true; - if ((*(u32 *)cred->security & 0xffffff00) == - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) - return true; - } -#endif - return false; -} -EXPORT_SYMBOL(creds_are_invalid); - -/* - * dump invalid credentials - */ -static void dump_invalid_creds(const struct cred *cred, const char *label, - const struct task_struct *tsk) -{ - printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n", - label, cred, - cred == &init_cred ? "[init]" : "", - cred == tsk->real_cred ? "[real]" : "", - cred == tsk->cred ? "[eff]" : ""); - printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n", - cred->magic, cred->put_addr); - printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n", - atomic_read(&cred->usage), - read_cred_subscribers(cred)); - printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", - cred->uid, cred->euid, cred->suid, cred->fsuid); - printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", - cred->gid, cred->egid, cred->sgid, cred->fsgid); -#ifdef CONFIG_SECURITY - printk(KERN_ERR "CRED: ->security is %p\n", cred->security); - if ((unsigned long) cred->security >= PAGE_SIZE && - (((unsigned long) cred->security & 0xffffff00) != - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) - printk(KERN_ERR "CRED: ->security {%x, %x}\n", - ((u32*)cred->security)[0], - ((u32*)cred->security)[1]); -#endif -} - -/* - * report use of invalid credentials - */ -void __invalid_creds(const struct cred *cred, const char *file, unsigned line) -{ - printk(KERN_ERR "CRED: Invalid credentials\n"); - printk(KERN_ERR "CRED: At %s:%u\n", file, line); - dump_invalid_creds(cred, "Specified", current); - BUG(); -} -EXPORT_SYMBOL(__invalid_creds); - -/* - * check the credentials on a process - */ -void __validate_process_creds(struct task_struct *tsk, - const char *file, unsigned line) -{ - if (tsk->cred == tsk->real_cred) { - if (unlikely(read_cred_subscribers(tsk->cred) < 2 || - 
creds_are_invalid(tsk->cred))) - goto invalid_creds; - } else { - if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || - read_cred_subscribers(tsk->cred) < 1 || - creds_are_invalid(tsk->real_cred) || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } - return; - -invalid_creds: - printk(KERN_ERR "CRED: Invalid process credentials\n"); - printk(KERN_ERR "CRED: At %s:%u\n", file, line); - - dump_invalid_creds(tsk->real_cred, "Real", tsk); - if (tsk->cred != tsk->real_cred) - dump_invalid_creds(tsk->cred, "Effective", tsk); - else - printk(KERN_ERR "CRED: Effective creds == Real creds\n"); - BUG(); -} -EXPORT_SYMBOL(__validate_process_creds); - -/* - * check creds for do_exit() - */ -void validate_creds_for_do_exit(struct task_struct *tsk) -{ - kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", - tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); - - __validate_process_creds(tsk, __FILE__, __LINE__); -} - -#endif /* CONFIG_DEBUG_CREDENTIALS */ -/* - * Kernel Debug Core - * - * Maintainer: Jason Wessel - * - * Copyright (C) 2000-2001 VERITAS Software Corporation. - * Copyright (C) 2002-2004 Timesys Corporation - * Copyright (C) 2003-2004 Amit S. Kale - * Copyright (C) 2004 Pavel Machek - * Copyright (C) 2004-2006 Tom Rini - * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2009 Wind River Systems, Inc. - * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar - * - * Contributors at various stages not listed above: - * Jason Wessel ( jason.wessel@windriver.com ) - * George Anzinger - * Anurekh Saxena (anurekh.saxena@timesys.com) - * Lake Stevens Instrument Division (Glenn Engel) - * Jim Kingdon, Cygnus Support. - * - * Original KGDB stub: David Grothe , - * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. 
This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "debug_core.h" - -static int kgdb_break_asap; - -struct debuggerinfo_struct kgdb_info[NR_CPUS]; - -/** - * kgdb_connected - Is a host GDB connected to us? - */ -int kgdb_connected; -EXPORT_SYMBOL_GPL(kgdb_connected); - -/* All the KGDB handlers are installed */ -int kgdb_io_module_registered; - -/* Guard for recursive entry */ -static int exception_level; - -struct kgdb_io *dbg_io_ops; -static DEFINE_SPINLOCK(kgdb_registration_lock); - -/* kgdb console driver is loaded */ -static int kgdb_con_registered; -/* determine if kgdb console output should be used */ -static int kgdb_use_con; -/* Flag for alternate operations for early debugging */ -bool dbg_is_early = true; -/* Next cpu to become the master debug core */ -int dbg_switch_cpu; - -/* Use kdb or gdbserver mode */ -int dbg_kdb_mode = 1; - -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - -module_param(kgdb_use_con, int, 0644); - -/* - * Holds information about breakpoints in a kernel. These breakpoints are - * added and removed by gdb. - */ -static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { - [0 ... 
KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } -}; - -/* - * The CPU# of the active CPU, or -1 if none: - */ -atomic_t kgdb_active = ATOMIC_INIT(-1); -EXPORT_SYMBOL_GPL(kgdb_active); -static DEFINE_RAW_SPINLOCK(dbg_master_lock); -static DEFINE_RAW_SPINLOCK(dbg_slave_lock); - -/* - * We use NR_CPUs not PERCPU, in case kgdb is used to debug early - * bootup code (which might not have percpu set up yet): - */ -static atomic_t masters_in_kgdb; -static atomic_t slaves_in_kgdb; -static atomic_t kgdb_break_tasklet_var; -atomic_t kgdb_setting_breakpoint; - -struct task_struct *kgdb_usethread; -struct task_struct *kgdb_contthread; - -int kgdb_single_step; -static pid_t kgdb_sstep_pid; - -/* to keep track of the CPU which is doing the single stepping*/ -atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); - -/* - * If you are debugging a problem where roundup (the collection of - * all other CPUs) is a problem [this should be extremely rare], - * then use the nokgdbroundup option to avoid roundup. 
In that case - * the other CPUs might interfere with your debugging context, so - * use this with care: - */ -static int kgdb_do_roundup = 1; - -static int __init opt_nokgdbroundup(char *str) -{ - kgdb_do_roundup = 0; - - return 0; -} - -early_param("nokgdbroundup", opt_nokgdbroundup); - -/* - * Finally, some KGDB code :-) - */ - -/* - * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: - */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) -{ - int err; - - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); - if (err) - return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); -} - -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) -{ - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); -} - -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - int err; - /* Validate setting the breakpoint and then removing it. In the - * remove fails, the kernel needs to emit a bad message because we - * are deep trouble not being able to put things back the way we - * found them. 
- */ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); - if (err) - return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); - if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); - return err; -} - -unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - -int __weak kgdb_arch_init(void) -{ - return 0; -} - -int __weak kgdb_skipexception(int exception, struct pt_regs *regs) -{ - return 0; -} - -/* - * Some architectures need cache flushes when we set/clear a - * breakpoint: - */ -static void kgdb_flush_swbreak_addr(unsigned long addr) -{ - if (!CACHE_FLUSH_IS_SAFE) - return; - - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); - } - /* Force flush instruction cache if it was outside the mm */ - flush_icache_range(addr, addr + BREAK_INSTR_SIZE); -} - -/* - * SW breakpoint management: - */ -int dbg_activate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_SET) - continue; - - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", addr); - continue; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_ACTIVE; - } - return ret; -} - -int dbg_set_sw_break(unsigned long addr) -{ - int err = kgdb_validate_break_address(addr); - int breakno = -1; - int i; - - if (err) - return err; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) - return -EEXIST; - } - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_REMOVED && - kgdb_break[i].bpt_addr == addr) { - breakno = i; - break; - } - } - - if (breakno == -1) { - for (i = 0; 
i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_UNDEFINED) { - breakno = i; - break; - } - } - } - - if (breakno == -1) - return -E2BIG; - - kgdb_break[breakno].state = BP_SET; - kgdb_break[breakno].type = BP_BREAKPOINT; - kgdb_break[breakno].bpt_addr = addr; - - return 0; -} - -int dbg_deactivate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); - ret = error; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_SET; - } - return ret; -} - -int dbg_remove_sw_break(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) { - kgdb_break[i].state = BP_REMOVED; - return 0; - } - } - return -ENOENT; -} - -int kgdb_isremovedbreak(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_REMOVED) && - (kgdb_break[i].bpt_addr == addr)) - return 1; - } - return 0; -} - -int dbg_remove_all_break(void) -{ - unsigned long addr; - int error; - int i; - - /* Clear memory breakpoints. */ - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); -setundefined: - kgdb_break[i].state = BP_UNDEFINED; - } - - /* Clear hardware breakpoints. */ - if (arch_kgdb_ops.remove_all_hw_break) - arch_kgdb_ops.remove_all_hw_break(); - - return 0; -} - -/* - * Return true if there is a valid kgdb I/O module. 
Also if no - * debugger is attached a message can be printed to the console about - * waiting for the debugger to attach. - * - * The print_wait argument is only to be true when called from inside - * the core kgdb_handle_exception, because it will wait for the - * debugger to attach. - */ -static int kgdb_io_ready(int print_wait) -{ - if (!dbg_io_ops) - return 0; - if (kgdb_connected) - return 1; - if (atomic_read(&kgdb_setting_breakpoint)) - return 1; - if (print_wait) { -#ifdef CONFIG_KGDB_KDB - if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); -#else - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); -#endif - } - return 1; -} - -static int kgdb_reenter_check(struct kgdb_state *ks) -{ - unsigned long addr; - - if (atomic_read(&kgdb_active) != raw_smp_processor_id()) - return 0; - - /* Panic on recursive debugger calls: */ - exception_level++; - addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - dbg_deactivate_sw_breakpoints(); - - /* - * If the break point removed ok at the place exception - * occurred, try to recover and print a warning to the end - * user because the user planted a breakpoint in a place that - * KGDB needs in order to function. 
- */ - if (dbg_remove_sw_break(addr) == 0) { - exception_level = 0; - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - dbg_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); - WARN_ON_ONCE(1); - - return 1; - } - dbg_remove_all_break(); - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - - if (exception_level > 1) { - dump_stack(); - panic("Recursive entry to debugger"); - } - - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); -#ifdef CONFIG_KGDB_KDB - /* Allow kdb to debug itself one level */ - return 0; -#endif - dump_stack(); - panic("Recursive entry to debugger"); - - return 1; -} - -static void dbg_touch_watchdogs(void) -{ - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - rcu_cpu_stall_reset(); -} - -static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, - int exception_state) -{ - unsigned long flags; - int sstep_tries = 100; - int error; - int cpu; - int trace_on = 0; - int online_cpus = num_online_cpus(); - - kgdb_info[ks->cpu].enter_kgdb++; - kgdb_info[ks->cpu].exception_state |= exception_state; - - if (exception_state == DCPU_WANT_MASTER) - atomic_inc(&masters_in_kgdb); - else - atomic_inc(&slaves_in_kgdb); - - if (arch_kgdb_ops.disable_hw_break) - arch_kgdb_ops.disable_hw_break(regs); - -acquirelock: - /* - * Interrupts will be restored by the 'trap return' code, except when - * single stepping. 
- */ - local_irq_save(flags); - - cpu = ks->cpu; - kgdb_info[cpu].debuggerinfo = regs; - kgdb_info[cpu].task = current; - kgdb_info[cpu].ret_state = 0; - kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; - - /* Make sure the above info reaches the primary CPU */ - smp_mb(); - - if (exception_level == 1) { - if (raw_spin_trylock(&dbg_master_lock)) - atomic_xchg(&kgdb_active, cpu); - goto cpu_master_loop; - } - - /* - * CPU will loop if it is a slave or request to become a kgdb - * master cpu and acquire the kgdb_active lock: - */ - while (1) { -cpu_loop: - if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) { - kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; - goto cpu_master_loop; - } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { - if (raw_spin_trylock(&dbg_master_lock)) { - atomic_xchg(&kgdb_active, cpu); - break; - } - } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { - if (!raw_spin_is_locked(&dbg_slave_lock)) - goto return_normal; - } else { -return_normal: - /* Return to normal operation by executing any - * hw breakpoint fixup. - */ - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - if (trace_on) - tracing_on(); - kgdb_info[cpu].exception_state &= - ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); - kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); - atomic_dec(&slaves_in_kgdb); - dbg_touch_watchdogs(); - local_irq_restore(flags); - return 0; - } - cpu_relax(); - } - - /* - * For single stepping, try to only enter on the processor - * that was single stepping. To guard against a deadlock, the - * kernel will only try for the value of sstep_tries before - * giving up and continuing on. 
- */ - if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - (kgdb_info[cpu].task && - kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { - atomic_set(&kgdb_active, -1); - raw_spin_unlock(&dbg_master_lock); - dbg_touch_watchdogs(); - local_irq_restore(flags); - - goto acquirelock; - } - - if (!kgdb_io_ready(1)) { - kgdb_info[cpu].ret_state = 1; - goto kgdb_restore; /* No I/O connection, resume the system */ - } - - /* - * Don't enter if we have hit a removed breakpoint. - */ - if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) - goto kgdb_restore; - - /* Call the I/O driver's pre_exception routine */ - if (dbg_io_ops->pre_exception) - dbg_io_ops->pre_exception(); - - /* - * Get the passive CPU lock which will hold all the non-primary - * CPU in a spin state while the debugger is active - */ - if (!kgdb_single_step) - raw_spin_lock(&dbg_slave_lock); - -#ifdef CONFIG_SMP - /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); -#endif - - /* - * Wait for the other CPUs to be notified and be waiting for us: - */ - while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + - atomic_read(&slaves_in_kgdb)) != online_cpus) - cpu_relax(); - - /* - * At this point the primary processor is completely - * in the debugger and all secondary CPUs are quiescent - */ - dbg_deactivate_sw_breakpoints(); - kgdb_single_step = 0; - kgdb_contthread = current; - exception_level = 0; - trace_on = tracing_is_on(); - if (trace_on) - tracing_off(); - - while (1) { -cpu_master_loop: - if (dbg_kdb_mode) { - kgdb_connected = 1; - error = kdb_stub(ks); - if (error == -1) - continue; - kgdb_connected = 0; - } else { - error = gdb_serial_stub(ks); - } - - if (error == DBG_PASS_EVENT) { - dbg_kdb_mode = !dbg_kdb_mode; - } else if (error == DBG_SWITCH_CPU_EVENT) { - kgdb_info[dbg_switch_cpu].exception_state |= - DCPU_NEXT_MASTER; - goto cpu_loop; - } else { - kgdb_info[cpu].ret_state = error; - break; - } - } - - 
/* Call the I/O driver's post_exception routine */ - if (dbg_io_ops->post_exception) - dbg_io_ops->post_exception(); - - if (!kgdb_single_step) { - raw_spin_unlock(&dbg_slave_lock); - /* Wait till all the CPUs have quit from the debugger. */ - while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) - cpu_relax(); - } - -kgdb_restore: - if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { - int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); - if (kgdb_info[sstep_cpu].task) - kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; - else - kgdb_sstep_pid = 0; - } - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - if (trace_on) - tracing_on(); - - kgdb_info[cpu].exception_state &= - ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); - kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); - atomic_dec(&masters_in_kgdb); - /* Free kgdb_active */ - atomic_set(&kgdb_active, -1); - raw_spin_unlock(&dbg_master_lock); - dbg_touch_watchdogs(); - local_irq_restore(flags); - - return kgdb_info[cpu].ret_state; -} - -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - * interface locks, if any (begin_session) - * kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) -{ - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - - ks->cpu = raw_smp_processor_id(); - ks->ex_vector = evector; - ks->signo = signo; - ks->err_code = ecode; - ks->kgdb_usethreadid = 0; - ks->linux_regs = regs; - - if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! 
*/ - if (kgdb_info[ks->cpu].enter_kgdb != 0) - return 0; - - return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); -} - -int kgdb_nmicallback(int cpu, void *regs) -{ -#ifdef CONFIG_SMP - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - - memset(ks, 0, sizeof(struct kgdb_state)); - ks->cpu = cpu; - ks->linux_regs = regs; - - if (kgdb_info[ks->cpu].enter_kgdb == 0 && - raw_spin_is_locked(&dbg_master_lock)) { - kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); - return 0; - } -#endif - return 1; -} - -static void kgdb_console_write(struct console *co, const char *s, - unsigned count) -{ - unsigned long flags; - - /* If we're debugging, or KGDB has not connected, don't try - * and print. */ - if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode) - return; - - local_irq_save(flags); - gdbstub_msg_write(s, count); - local_irq_restore(flags); -} - -static struct console kgdbcons = { - .name = "kgdb", - .write = kgdb_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED, - .index = -1, -}; - -#ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_dbg(int key) -{ - if (!dbg_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); - return; - } - if (!kgdb_connected) { -#ifdef CONFIG_KGDB_KDB - if (!dbg_kdb_mode) - printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); -#else - printk(KERN_CRIT "Entering KGDB\n"); -#endif - } - - kgdb_breakpoint(); -} - -static struct sysrq_key_op sysrq_dbg_op = { - .handler = sysrq_handle_dbg, - .help_msg = "debug(G)", - .action_msg = "DEBUG", -}; -#endif - -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) -{ - if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); - kgdb_breakpoint(); - return NOTIFY_DONE; -} - -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; - -void __weak kgdb_arch_late(void) -{ -} - -void __init dbg_late_init(void) -{ - dbg_is_early = false; - if (kgdb_io_module_registered) - 
kgdb_arch_late(); - kdb_init(KDB_INIT_FULL); -} - -static void kgdb_register_callbacks(void) -{ - if (!kgdb_io_module_registered) { - kgdb_io_module_registered = 1; - kgdb_arch_init(); - if (!dbg_is_early) - kgdb_arch_late(); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); -#ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key('g', &sysrq_dbg_op); -#endif - if (kgdb_use_con && !kgdb_con_registered) { - register_console(&kgdbcons); - kgdb_con_registered = 1; - } - } -} - -static void kgdb_unregister_callbacks(void) -{ - /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any - * break exceptions at the time. - */ - if (kgdb_io_module_registered) { - kgdb_io_module_registered = 0; - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); - kgdb_arch_exit(); -#ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key('g', &sysrq_dbg_op); -#endif - if (kgdb_con_registered) { - unregister_console(&kgdbcons); - kgdb_con_registered = 0; - } - } -} - -/* - * There are times a tasklet needs to be used vs a compiled in - * break point so as to cause an exception outside a kgdb I/O module, - * such as is the case with kgdboe, where calling a breakpoint in the - * I/O driver itself would be fatal. 
- */ -static void kgdb_tasklet_bpt(unsigned long ing) -{ - kgdb_breakpoint(); - atomic_set(&kgdb_break_tasklet_var, 0); -} - -static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); - -void kgdb_schedule_breakpoint(void) -{ - if (atomic_read(&kgdb_break_tasklet_var) || - atomic_read(&kgdb_active) != -1 || - atomic_read(&kgdb_setting_breakpoint)) - return; - atomic_inc(&kgdb_break_tasklet_var); - tasklet_schedule(&kgdb_tasklet_breakpoint); -} -EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); - -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - -/** - * kgdb_register_io_module - register KGDB IO module - * @new_dbg_io_ops: the io ops vector - * - * Register it with the KGDB core. - */ -int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) -{ - int err; - - spin_lock(&kgdb_registration_lock); - - if (dbg_io_ops) { - spin_unlock(&kgdb_registration_lock); - - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); - return -EBUSY; - } - - if (new_dbg_io_ops->init) { - err = new_dbg_io_ops->init(); - if (err) { - spin_unlock(&kgdb_registration_lock); - return err; - } - } - - dbg_io_ops = new_dbg_io_ops; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_dbg_io_ops->name); - - /* Arm KGDB now. */ - kgdb_register_callbacks(); - - if (kgdb_break_asap) - kgdb_initial_breakpoint(); - - return 0; -} -EXPORT_SYMBOL_GPL(kgdb_register_io_module); - -/** - * kkgdb_unregister_io_module - unregister KGDB IO module - * @old_dbg_io_ops: the io ops vector - * - * Unregister it with the KGDB core. - */ -void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) -{ - BUG_ON(kgdb_connected); - - /* - * KGDB is no longer able to communicate out, so - * unregister our callbacks and reset state. 
- */ - kgdb_unregister_callbacks(); - - spin_lock(&kgdb_registration_lock); - - WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops); - dbg_io_ops = NULL; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", - old_dbg_io_ops->name); -} -EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); - -int dbg_io_get_char(void) -{ - int ret = dbg_io_ops->read_char(); - if (ret == NO_POLL_CHAR) - return -1; - if (!dbg_kdb_mode) - return ret; - if (ret == 127) - return 8; - return ret; -} - -/** - * kgdb_breakpoint - generate breakpoint exception - * - * This function will generate a breakpoint exception. It is used at the - * beginning of a program to sync up with a debugger and can be used - * otherwise as a quick means to stop program execution and "break" into - * the debugger. - */ -void kgdb_breakpoint(void) -{ - atomic_inc(&kgdb_setting_breakpoint); - wmb(); /* Sync point before breakpoint */ - arch_kgdb_breakpoint(); - wmb(); /* Sync point after breakpoint */ - atomic_dec(&kgdb_setting_breakpoint); -} -EXPORT_SYMBOL_GPL(kgdb_breakpoint); - -static int __init opt_kgdb_wait(char *str) -{ - kgdb_break_asap = 1; - - kdb_init(KDB_INIT_EARLY); - if (kgdb_io_module_registered) - kgdb_initial_breakpoint(); - - return 0; -} - -early_param("kgdbwait", opt_kgdb_wait); -/* - * Kernel Debug Core - * - * Maintainer: Jason Wessel - * - * Copyright (C) 2000-2001 VERITAS Software Corporation. - * Copyright (C) 2002-2004 Timesys Corporation - * Copyright (C) 2003-2004 Amit S. Kale - * Copyright (C) 2004 Pavel Machek - * Copyright (C) 2004-2006 Tom Rini - * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2009 Wind River Systems, Inc. - * Copyright (C) 2007 MontaVista Software, Inc. 
- * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar - * - * Contributors at various stages not listed above: - * Jason Wessel ( jason.wessel@windriver.com ) - * George Anzinger - * Anurekh Saxena (anurekh.saxena@timesys.com) - * Lake Stevens Instrument Division (Glenn Engel) - * Jim Kingdon, Cygnus Support. - * - * Original KGDB stub: David Grothe , - * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "debug_core.h" - -#define KGDB_MAX_THREAD_QUERY 17 - -/* Our I/O buffers. */ -static char remcom_in_buffer[BUFMAX]; -static char remcom_out_buffer[BUFMAX]; -static int gdbstub_use_prev_in_buf; -static int gdbstub_prev_in_buf_pos; - -/* Storage for the registers, in GDB format. */ -static unsigned long gdb_regs[(NUMREGBYTES + - sizeof(unsigned long) - 1) / - sizeof(unsigned long)]; - -/* - * GDB remote protocol parser: - */ - -#ifdef CONFIG_KGDB_KDB -static int gdbstub_read_wait(void) -{ - int ret = -1; - int i; - - if (unlikely(gdbstub_use_prev_in_buf)) { - if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf) - return remcom_in_buffer[gdbstub_prev_in_buf_pos++]; - else - gdbstub_use_prev_in_buf = 0; - } - - /* poll any additional I/O interfaces that are defined */ - while (ret < 0) - for (i = 0; kdb_poll_funcs[i] != NULL; i++) { - ret = kdb_poll_funcs[i](); - if (ret > 0) - break; - } - return ret; -} -#else -static int gdbstub_read_wait(void) -{ - int ret = dbg_io_ops->read_char(); - while (ret == NO_POLL_CHAR) - ret = dbg_io_ops->read_char(); - return ret; -} -#endif -/* scan for the sequence $# */ -static void get_packet(char *buffer) -{ - unsigned char checksum; - unsigned char xmitcsum; - int count; - char ch; - - do { - /* - * Spin and wait around for the start character, ignore all - * other characters: - */ - while 
((ch = (gdbstub_read_wait())) != '$') - /* nothing */; - - kgdb_connected = 1; - checksum = 0; - xmitcsum = -1; - - count = 0; - - /* - * now, read until a # or end of buffer is found: - */ - while (count < (BUFMAX - 1)) { - ch = gdbstub_read_wait(); - if (ch == '#') - break; - checksum = checksum + ch; - buffer[count] = ch; - count = count + 1; - } - - if (ch == '#') { - xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; - xmitcsum += hex_to_bin(gdbstub_read_wait()); - - if (checksum != xmitcsum) - /* failed checksum */ - dbg_io_ops->write_char('-'); - else - /* successful transfer */ - dbg_io_ops->write_char('+'); - if (dbg_io_ops->flush) - dbg_io_ops->flush(); - } - buffer[count] = 0; - } while (checksum != xmitcsum); -} - -/* - * Send the packet in buffer. - * Check for gdb connection if asked for. - */ -static void put_packet(char *buffer) -{ - unsigned char checksum; - int count; - char ch; - - /* - * $#. - */ - while (1) { - dbg_io_ops->write_char('$'); - checksum = 0; - count = 0; - - while ((ch = buffer[count])) { - dbg_io_ops->write_char(ch); - checksum += ch; - count++; - } - - dbg_io_ops->write_char('#'); - dbg_io_ops->write_char(hex_asc_hi(checksum)); - dbg_io_ops->write_char(hex_asc_lo(checksum)); - if (dbg_io_ops->flush) - dbg_io_ops->flush(); - - /* Now see what we get in reply. */ - ch = gdbstub_read_wait(); - - if (ch == 3) - ch = gdbstub_read_wait(); - - /* If we get an ACK, we are done. */ - if (ch == '+') - return; - - /* - * If we get the start of another packet, this means - * that GDB is attempting to reconnect. We will NAK - * the packet being sent, and stop trying to send this - * packet. 
- */ - if (ch == '$') { - dbg_io_ops->write_char('-'); - if (dbg_io_ops->flush) - dbg_io_ops->flush(); - return; - } - } -} - -static char gdbmsgbuf[BUFMAX + 1]; - -void gdbstub_msg_write(const char *s, int len) -{ - char *bufptr; - int wcount; - int i; - - if (len == 0) - len = strlen(s); - - /* 'O'utput */ - gdbmsgbuf[0] = 'O'; - - /* Fill and send buffers... */ - while (len > 0) { - bufptr = gdbmsgbuf + 1; - - /* Calculate how many this time */ - if ((len << 1) > (BUFMAX - 2)) - wcount = (BUFMAX - 2) >> 1; - else - wcount = len; - - /* Pack in hex chars */ - for (i = 0; i < wcount; i++) - bufptr = hex_byte_pack(bufptr, s[i]); - *bufptr = '\0'; - - /* Move up */ - s += wcount; - len -= wcount; - - /* Write packet */ - put_packet(gdbmsgbuf); - } -} - -/* - * Convert the memory pointed to by mem into hex, placing result in - * buf. Return a pointer to the last char put in buf (null). May - * return an error. - */ -char *kgdb_mem2hex(char *mem, char *buf, int count) -{ - char *tmp; - int err; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory copy. Hex conversion will work against this one. - */ - tmp = buf + count; - - err = probe_kernel_read(tmp, mem, count); - if (err) - return NULL; - while (count > 0) { - buf = hex_byte_pack(buf, *tmp); - tmp++; - count--; - } - *buf = 0; - - return buf; -} - -/* - * Convert the hex array pointed to by buf into binary to be placed in - * mem. Return a pointer to the character AFTER the last byte - * written. May return an error. - */ -int kgdb_hex2mem(char *buf, char *mem, int count) -{ - char *tmp_raw; - char *tmp_hex; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory that is converted from hex. 
- */ - tmp_raw = buf + count * 2; - - tmp_hex = tmp_raw - 1; - while (tmp_hex >= buf) { - tmp_raw--; - *tmp_raw = hex_to_bin(*tmp_hex--); - *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; - } - - return probe_kernel_write(mem, tmp_raw, count); -} - -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. - */ -int kgdb_hex2long(char **ptr, unsigned long *long_val) -{ - int hex_val; - int num = 0; - int negate = 0; - - *long_val = 0; - - if (**ptr == '-') { - negate = 1; - (*ptr)++; - } - while (**ptr) { - hex_val = hex_to_bin(**ptr); - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - num++; - (*ptr)++; - } - - if (negate) - *long_val = -*long_val; - - return num; -} - -/* - * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. - * The input buf is overwitten with the result to write to mem. - */ -static int kgdb_ebin2mem(char *buf, char *mem, int count) -{ - int size = 0; - char *c = buf; - - while (count-- > 0) { - c[size] = *buf++; - if (c[size] == 0x7d) - c[size] = *buf++ ^ 0x20; - size++; - } - - return probe_kernel_write(mem, c, size); -} - -#if DBG_MAX_REG_NUM > 0 -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ - int i; - int idx = 0; - char *ptr = (char *)gdb_regs; - - for (i = 0; i < DBG_MAX_REG_NUM; i++) { - dbg_get_reg(i, ptr + idx, regs); - idx += dbg_reg_def[i].size; - } -} - -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ - int i; - int idx = 0; - char *ptr = (char *)gdb_regs; - - for (i = 0; i < DBG_MAX_REG_NUM; i++) { - dbg_set_reg(i, ptr + idx, regs); - idx += dbg_reg_def[i].size; - } -} -#endif /* DBG_MAX_REG_NUM > 0 */ - -/* Write memory due to an 'M' or 'X' packet. 
*/ -static int write_mem_msg(int binary) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long addr; - unsigned long length; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && - kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { - if (binary) - err = kgdb_ebin2mem(ptr, (char *)addr, length); - else - err = kgdb_hex2mem(ptr, (char *)addr, length); - if (err) - return err; - if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length); - return 0; - } - - return -EINVAL; -} - -static void error_packet(char *pkt, int error) -{ - error = -error; - pkt[0] = 'E'; - pkt[1] = hex_asc[(error / 10)]; - pkt[2] = hex_asc[(error % 10)]; - pkt[3] = '\0'; -} - -/* - * Thread ID accessors. We represent a flat TID space to GDB, where - * the per CPU idle threads (which under Linux all have PID 0) are - * remapped to negative TIDs. - */ - -#define BUF_THREAD_ID_SIZE 8 - -static char *pack_threadid(char *pkt, unsigned char *id) -{ - unsigned char *limit; - int lzero = 1; - - limit = id + (BUF_THREAD_ID_SIZE / 2); - while (id < limit) { - if (!lzero || *id != 0) { - pkt = hex_byte_pack(pkt, *id); - lzero = 0; - } - id++; - } - - if (lzero) - pkt = hex_byte_pack(pkt, 0); - - return pkt; -} - -static void int_to_threadref(unsigned char *id, int value) -{ - put_unaligned_be32(value, id); -} - -static struct task_struct *getthread(struct pt_regs *regs, int tid) -{ - /* - * Non-positive TIDs are remapped to the cpu shadow information - */ - if (tid == 0 || tid == -1) - tid = -atomic_read(&kgdb_active) - 2; - if (tid < -1 && tid > -NR_CPUS - 2) { - if (kgdb_info[-tid - 2].task) - return kgdb_info[-tid - 2].task; - else - return idle_task(-tid - 2); - } - if (tid <= 0) { - printk(KERN_ERR "KGDB: Internal thread select error\n"); - dump_stack(); - return NULL; - } - - /* - * find_task_by_pid_ns() does not take the tasklist lock anymore - * but is nicely RCU locked - hence is a pretty resilient - * thing to use: - */ - return find_task_by_pid_ns(tid, 
&init_pid_ns); -} - - -/* - * Remap normal tasks to their real PID, - * CPU shadow threads are mapped to -CPU - 2 - */ -static inline int shadow_pid(int realpid) -{ - if (realpid) - return realpid; - - return -raw_smp_processor_id() - 2; -} - -/* - * All the functions that start with gdb_cmd are the various - * operations to implement the handlers for the gdbserial protocol - * where KGDB is communicating with an external debugger - */ - -/* Handle the '?' status packets */ -static void gdb_cmd_status(struct kgdb_state *ks) -{ - /* - * We know that this packet is only sent - * during initial connect. So to be safe, - * we clear out our breakpoints now in case - * GDB is reconnecting. - */ - dbg_remove_all_break(); - - remcom_out_buffer[0] = 'S'; - hex_byte_pack(&remcom_out_buffer[1], ks->signo); -} - -static void gdb_get_regs_helper(struct kgdb_state *ks) -{ - struct task_struct *thread; - void *local_debuggerinfo; - int i; - - thread = kgdb_usethread; - if (!thread) { - thread = kgdb_info[ks->cpu].task; - local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; - } else { - local_debuggerinfo = NULL; - for_each_online_cpu(i) { - /* - * Try to find the task on some other - * or possibly this node if we do not - * find the matching task then we try - * to approximate the results. - */ - if (thread == kgdb_info[i].task) - local_debuggerinfo = kgdb_info[i].debuggerinfo; - } - } - - /* - * All threads that don't have debuggerinfo should be - * in schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. - */ - if (local_debuggerinfo) { - pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); - } else { - /* - * Pull stuff saved during switch_to; nothing - * else is accessible (or even particularly - * relevant). - * - * This should be enough for a stack trace. 
- */ - sleeping_thread_to_gdb_regs(gdb_regs, thread); - } -} - -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) -{ - gdb_get_regs_helper(ks); - kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); -} - -/* Handle the 'G' set registers request */ -static void gdb_cmd_setregs(struct kgdb_state *ks) -{ - kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); - - if (kgdb_usethread && kgdb_usethread != current) { - error_packet(remcom_out_buffer, -EINVAL); - } else { - gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); - } -} - -/* Handle the 'm' memory read bytes */ -static void gdb_cmd_memread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long length; - unsigned long addr; - char *err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && - kgdb_hex2long(&ptr, &length) > 0) { - err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (!err) - error_packet(remcom_out_buffer, -EINVAL); - } else { - error_packet(remcom_out_buffer, -EINVAL); - } -} - -/* Handle the 'M' memory write bytes */ -static void gdb_cmd_memwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(0); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -#if DBG_MAX_REG_NUM > 0 -static char *gdb_hex_reg_helper(int regnum, char *out) -{ - int i; - int offset = 0; - - for (i = 0; i < regnum; i++) - offset += dbg_reg_def[i].size; - return kgdb_mem2hex((char *)gdb_regs + offset, out, - dbg_reg_def[i].size); -} - -/* Handle the 'p' individual regster get */ -static void gdb_cmd_reg_get(struct kgdb_state *ks) -{ - unsigned long regnum; - char *ptr = &remcom_in_buffer[1]; - - kgdb_hex2long(&ptr, ®num); - if (regnum >= DBG_MAX_REG_NUM) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - gdb_get_regs_helper(ks); - gdb_hex_reg_helper(regnum, remcom_out_buffer); -} - -/* Handle the 'P' individual regster set */ 
-static void gdb_cmd_reg_set(struct kgdb_state *ks) -{ - unsigned long regnum; - char *ptr = &remcom_in_buffer[1]; - int i = 0; - - kgdb_hex2long(&ptr, ®num); - if (*ptr++ != '=' || - !(!kgdb_usethread || kgdb_usethread == current) || - !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - memset(gdb_regs, 0, sizeof(gdb_regs)); - while (i < sizeof(gdb_regs) * 2) - if (hex_to_bin(ptr[i]) >= 0) - i++; - else - break; - i = i / 2; - kgdb_hex2mem(ptr, (char *)gdb_regs, i); - dbg_set_reg(regnum, gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); -} -#endif /* DBG_MAX_REG_NUM > 0 */ - -/* Handle the 'X' memory binary write bytes */ -static void gdb_cmd_binwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(1); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'D' or 'k', detach or kill packets */ -static void gdb_cmd_detachkill(struct kgdb_state *ks) -{ - int error; - - /* The detach case */ - if (remcom_in_buffer[0] == 'D') { - error = dbg_remove_all_break(); - if (error < 0) { - error_packet(remcom_out_buffer, error); - } else { - strcpy(remcom_out_buffer, "OK"); - kgdb_connected = 0; - } - put_packet(remcom_out_buffer); - } else { - /* - * Assume the kill case, with no exit code checking, - * trying to force detach the debugger: - */ - dbg_remove_all_break(); - kgdb_connected = 0; - } -} - -/* Handle the 'R' reboot packets */ -static int gdb_cmd_reboot(struct kgdb_state *ks) -{ - /* For now, only honor R0 */ - if (strcmp(remcom_in_buffer, "R0") == 0) { - printk(KERN_CRIT "Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); - put_packet(remcom_out_buffer); - - /* - * Execution should not return from - * machine_emergency_restart() - */ - machine_emergency_restart(); - kgdb_connected = 0; - - return 1; - } - return 0; -} - -/* Handle the 'q' query packets */ -static void gdb_cmd_query(struct kgdb_state *ks) -{ - struct 
task_struct *g; - struct task_struct *p; - unsigned char thref[BUF_THREAD_ID_SIZE]; - char *ptr; - int i; - int cpu; - int finished = 0; - - switch (remcom_in_buffer[1]) { - case 's': - case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) - break; - - i = 0; - remcom_out_buffer[0] = 'm'; - ptr = remcom_out_buffer + 1; - if (remcom_in_buffer[1] == 'f') { - /* Each cpu is a shadow thread */ - for_each_online_cpu(cpu) { - ks->thr_query = 0; - int_to_threadref(thref, -cpu - 2); - ptr = pack_threadid(ptr, thref); - *(ptr++) = ','; - i++; - } - } - - do_each_thread(g, p) { - if (i >= ks->thr_query && !finished) { - int_to_threadref(thref, p->pid); - ptr = pack_threadid(ptr, thref); - *(ptr++) = ','; - ks->thr_query++; - if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) - finished = 1; - } - i++; - } while_each_thread(g, p); - - *(--ptr) = '\0'; - break; - - case 'C': - /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); - ks->threadid = shadow_pid(current->pid); - int_to_threadref(thref, ks->threadid); - pack_threadid(remcom_out_buffer + 2, thref); - break; - case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) - break; - - ks->threadid = 0; - ptr = remcom_in_buffer + 17; - kgdb_hex2long(&ptr, &ks->threadid); - if (!getthread(ks->linux_regs, ks->threadid)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - if ((int)ks->threadid > 0) { - kgdb_mem2hex(getthread(ks->linux_regs, - ks->threadid)->comm, - remcom_out_buffer, 16); - } else { - static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - - sprintf(tmpstr, "shadowCPU%d", - (int)(-ks->threadid - 2)); - kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); - } - break; -#ifdef CONFIG_KGDB_KDB - case 'R': - if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) { - int len = strlen(remcom_in_buffer + 6); - - if ((len % 2) != 0) { - strcpy(remcom_out_buffer, "E01"); - break; - } - kgdb_hex2mem(remcom_in_buffer + 6, - remcom_out_buffer, len); - len = len / 2; - remcom_out_buffer[len++] = 0; - 
- kdb_parse(remcom_out_buffer); - strcpy(remcom_out_buffer, "OK"); - } - break; -#endif - } -} - -/* Handle the 'H' task query packets */ -static void gdb_cmd_task(struct kgdb_state *ks) -{ - struct task_struct *thread; - char *ptr; - - switch (remcom_in_buffer[1]) { - case 'g': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_usethread = thread; - ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); - break; - case 'c': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - if (!ks->threadid) { - kgdb_contthread = NULL; - } else { - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_contthread = thread; - } - strcpy(remcom_out_buffer, "OK"); - break; - } -} - -/* Handle the 'T' thread query packets */ -static void gdb_cmd_thread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - struct task_struct *thread; - - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, -EINVAL); -} - -/* Handle the 'z' or 'Z' breakpoint remove or set packets */ -static void gdb_cmd_break(struct kgdb_state *ks) -{ - /* - * Since GDB-5.3, it's been drafted that '0' is a software - * breakpoint, '1' is a hardware breakpoint, so let's do that. - */ - char *bpt_type = &remcom_in_buffer[1]; - char *ptr = &remcom_in_buffer[2]; - unsigned long addr; - unsigned long length; - int error = 0; - - if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { - /* Unsupported */ - if (*bpt_type > '4') - return; - } else { - if (*bpt_type != '0' && *bpt_type != '1') - /* Unsupported. 
*/ - return; - } - - /* - * Test if this is a hardware breakpoint, and - * if we support it: - */ - if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) - /* Unsupported. */ - return; - - if (*(ptr++) != ',') { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (!kgdb_hex2long(&ptr, &addr)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (*(ptr++) != ',' || - !kgdb_hex2long(&ptr, &length)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - - if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') - error = dbg_set_sw_break(addr); - else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') - error = dbg_remove_sw_break(addr); - else if (remcom_in_buffer[0] == 'Z') - error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type - '0'); - else if (remcom_in_buffer[0] == 'z') - error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type - '0'); - - if (error == 0) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, error); -} - -/* Handle the 'C' signal / exception passing packets */ -static int gdb_cmd_exception_pass(struct kgdb_state *ks) -{ - /* C09 == pass exception - * C15 == detach kgdb, pass exception - */ - if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'c'; - - } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'D'; - dbg_remove_all_break(); - kgdb_connected = 0; - return 1; - - } else { - gdbstub_msg_write("KGDB only knows signal 9 (pass)" - " and 15 (pass and disconnect)\n" - "Executing a continue without signal passing\n", 0); - remcom_in_buffer[0] = 'c'; - } - - /* Indicate fall through */ - return -1; -} - -/* - * This function performs all gdbserial command procesing - */ -int gdb_serial_stub(struct kgdb_state *ks) -{ - int error = 0; - int tmp; - - /* Initialize comm buffer and globals. 
*/ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - - if (kgdb_connected) { - unsigned char thref[BUF_THREAD_ID_SIZE]; - char *ptr; - - /* Reply to host that an exception has occurred */ - ptr = remcom_out_buffer; - *ptr++ = 'T'; - ptr = hex_byte_pack(ptr, ks->signo); - ptr += strlen(strcpy(ptr, "thread:")); - int_to_threadref(thref, shadow_pid(current->pid)); - ptr = pack_threadid(ptr, thref); - *ptr++ = ';'; - put_packet(remcom_out_buffer); - } - - while (1) { - error = 0; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - get_packet(remcom_in_buffer); - - switch (remcom_in_buffer[0]) { - case '?': /* gdbserial status */ - gdb_cmd_status(ks); - break; - case 'g': /* return the value of the CPU registers */ - gdb_cmd_getregs(ks); - break; - case 'G': /* set the value of the CPU registers - return OK */ - gdb_cmd_setregs(ks); - break; - case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ - gdb_cmd_memread(ks); - break; - case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_memwrite(ks); - break; -#if DBG_MAX_REG_NUM > 0 - case 'p': /* pXX Return gdb register XX (in hex) */ - gdb_cmd_reg_get(ks); - break; - case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ - gdb_cmd_reg_set(ks); - break; -#endif /* DBG_MAX_REG_NUM > 0 */ - case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_binwrite(ks); - break; - /* kill or detach. KGDB should treat this like a - * continue. 
- */ - case 'D': /* Debugger detach */ - case 'k': /* Debugger detach via kill */ - gdb_cmd_detachkill(ks); - goto default_handle; - case 'R': /* Reboot */ - if (gdb_cmd_reboot(ks)) - goto default_handle; - break; - case 'q': /* query command */ - gdb_cmd_query(ks); - break; - case 'H': /* task related */ - gdb_cmd_task(ks); - break; - case 'T': /* Query thread status */ - gdb_cmd_thread(ks); - break; - case 'z': /* Break point remove */ - case 'Z': /* Break point set */ - gdb_cmd_break(ks); - break; -#ifdef CONFIG_KGDB_KDB - case '3': /* Escape into back into kdb */ - if (remcom_in_buffer[1] == '\0') { - gdb_cmd_detachkill(ks); - return DBG_PASS_EVENT; - } -#endif - case 'C': /* Exception passing */ - tmp = gdb_cmd_exception_pass(ks); - if (tmp > 0) - goto default_handle; - if (tmp == 0) - break; - /* Fall through on tmp < 0 */ - case 'c': /* Continue packet */ - case 's': /* Single step packet */ - if (kgdb_contthread && kgdb_contthread != current) { - /* Can't switch threads in kgdb */ - error_packet(remcom_out_buffer, -EINVAL); - break; - } - dbg_activate_sw_breakpoints(); - /* Fall through to default processing */ - default: -default_handle: - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - /* - * Leave cmd processing on error, detach, - * kill, continue, or single step. 
- */ - if (error >= 0 || remcom_in_buffer[0] == 'D' || - remcom_in_buffer[0] == 'k') { - error = 0; - goto kgdb_exit; - } - - } - - /* reply to the request */ - put_packet(remcom_out_buffer); - } - -kgdb_exit: - if (ks->pass_exception) - error = 1; - return error; -} - -int gdbstub_state(struct kgdb_state *ks, char *cmd) -{ - int error; - - switch (cmd[0]) { - case 'e': - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - return error; - case 's': - case 'c': - strcpy(remcom_in_buffer, cmd); - return 0; - case '$': - strcpy(remcom_in_buffer, cmd); - gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); - gdbstub_prev_in_buf_pos = 0; - return 0; - } - dbg_io_ops->write_char('+'); - put_packet(remcom_out_buffer); - return 0; -} - -/** - * gdbstub_exit - Send an exit message to GDB - * @status: The exit code to report. - */ -void gdbstub_exit(int status) -{ - unsigned char checksum, ch, buffer[3]; - int loop; - - buffer[0] = 'W'; - buffer[1] = hex_asc_hi(status); - buffer[2] = hex_asc_lo(status); - - dbg_io_ops->write_char('$'); - checksum = 0; - - for (loop = 0; loop < 3; loop++) { - ch = buffer[loop]; - checksum += ch; - dbg_io_ops->write_char(ch); - } - - dbg_io_ops->write_char('#'); - dbg_io_ops->write_char(hex_asc_hi(checksum)); - dbg_io_ops->write_char(hex_asc_lo(checksum)); - - /* make sure the output is flushed, lest the bootloader clobber it */ - dbg_io_ops->flush(); -} -/* - * Kernel Debugger Architecture Independent Breakpoint Handler - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "kdb_private.h" - -/* - * Table of kdb_breakpoints - */ -kdb_bp_t kdb_breakpoints[KDB_MAXBPT]; - -static void kdb_setsinglestep(struct pt_regs *regs) -{ - KDB_STATE_SET(DOING_SS); -} - -static char *kdb_rwtypes[] = { - "Instruction(i)", - "Instruction(Register)", - "Data Write", - "I/O", - "Data Access" -}; - -static char *kdb_bptype(kdb_bp_t *bp) -{ - if (bp->bp_type < 0 || bp->bp_type > 4) - return ""; - - return kdb_rwtypes[bp->bp_type]; -} - -static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) -{ - int nextarg = *nextargp; - int diag; - - bp->bph_length = 1; - if ((argc + 1) != nextarg) { - if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) - bp->bp_type = BP_ACCESS_WATCHPOINT; - else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) - bp->bp_type = BP_WRITE_WATCHPOINT; - else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) - bp->bp_type = BP_HARDWARE_BREAKPOINT; - else - return KDB_ARGCOUNT; - - bp->bph_length = 1; - - nextarg++; - - if ((argc + 1) != nextarg) { - unsigned long len; - - diag = kdbgetularg((char *)argv[nextarg], - &len); - if (diag) - return diag; - - - if (len > 8) - return KDB_BADLENGTH; - - bp->bph_length = len; - nextarg++; - } - - if ((argc + 1) != nextarg) - return KDB_ARGCOUNT; - } - - *nextargp = nextarg; - return 0; -} - -static int _kdb_bp_remove(kdb_bp_t *bp) -{ - int ret = 1; - if (!bp->bp_installed) - return ret; - if (!bp->bp_type) - ret = dbg_remove_sw_break(bp->bp_addr); - else - ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr, - bp->bph_length, - bp->bp_type); - if (ret == 0) - bp->bp_installed = 0; - return ret; -} - -static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp) -{ - if (KDB_DEBUG(BP)) - kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs)); - - /* - * Setup single step - */ - kdb_setsinglestep(regs); - - /* - * Reset delay attribute - */ - 
bp->bp_delay = 0; - bp->bp_delayed = 1; -} - -static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) -{ - int ret; - /* - * Install the breakpoint, if it is not already installed. - */ - - if (KDB_DEBUG(BP)) - kdb_printf("%s: bp_installed %d\n", - __func__, bp->bp_installed); - if (!KDB_STATE(SSBPT)) - bp->bp_delay = 0; - if (bp->bp_installed) - return 1; - if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) { - if (KDB_DEBUG(BP)) - kdb_printf("%s: delayed bp\n", __func__); - kdb_handle_bp(regs, bp); - return 0; - } - if (!bp->bp_type) - ret = dbg_set_sw_break(bp->bp_addr); - else - ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr, - bp->bph_length, - bp->bp_type); - if (ret == 0) { - bp->bp_installed = 1; - } else { - kdb_printf("%s: failed to set breakpoint at 0x%lx\n", - __func__, bp->bp_addr); - return 1; - } - return 0; -} - -/* - * kdb_bp_install - * - * Install kdb_breakpoints prior to returning from the - * kernel debugger. This allows the kdb_breakpoints to be set - * upon functions that are used internally by kdb, such as - * printk(). This function is only called once per kdb session. - */ -void kdb_bp_install(struct pt_regs *regs) -{ - int i; - - for (i = 0; i < KDB_MAXBPT; i++) { - kdb_bp_t *bp = &kdb_breakpoints[i]; - - if (KDB_DEBUG(BP)) { - kdb_printf("%s: bp %d bp_enabled %d\n", - __func__, i, bp->bp_enabled); - } - if (bp->bp_enabled) - _kdb_bp_install(regs, bp); - } -} - -/* - * kdb_bp_remove - * - * Remove kdb_breakpoints upon entry to the kernel debugger. - * - * Parameters: - * None. - * Outputs: - * None. - * Returns: - * None. - * Locking: - * None. - * Remarks: - */ -void kdb_bp_remove(void) -{ - int i; - - for (i = KDB_MAXBPT - 1; i >= 0; i--) { - kdb_bp_t *bp = &kdb_breakpoints[i]; - - if (KDB_DEBUG(BP)) { - kdb_printf("%s: bp %d bp_enabled %d\n", - __func__, i, bp->bp_enabled); - } - if (bp->bp_enabled) - _kdb_bp_remove(bp); - } -} - - -/* - * kdb_printbp - * - * Internal function to format and print a breakpoint entry. 
- * - * Parameters: - * None. - * Outputs: - * None. - * Returns: - * None. - * Locking: - * None. - * Remarks: - */ - -static void kdb_printbp(kdb_bp_t *bp, int i) -{ - kdb_printf("%s ", kdb_bptype(bp)); - kdb_printf("BP #%d at ", i); - kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); - - if (bp->bp_enabled) - kdb_printf("\n is enabled"); - else - kdb_printf("\n is disabled"); - - kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", - bp->bp_addr, bp->bp_type, bp->bp_installed); - - kdb_printf("\n"); -} - -/* - * kdb_bp - * - * Handle the bp commands. - * - * [bp|bph] [DATAR|DATAW] - * - * Parameters: - * argc Count of arguments in argv - * argv Space delimited command line arguments - * Outputs: - * None. - * Returns: - * Zero for success, a kdb diagnostic if failure. - * Locking: - * None. - * Remarks: - * - * bp Set breakpoint on all cpus. Only use hardware assist if need. - * bph Set breakpoint on all cpus. Force hardware register - */ - -static int kdb_bp(int argc, const char **argv) -{ - int i, bpno; - kdb_bp_t *bp, *bp_check; - int diag; - char *symname = NULL; - long offset = 0ul; - int nextarg; - kdb_bp_t template = {0}; - - if (argc == 0) { - /* - * Display breakpoint table - */ - for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; - bpno++, bp++) { - if (bp->bp_free) - continue; - kdb_printbp(bp, bpno); - } - - return 0; - } - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr, - &offset, &symname); - if (diag) - return diag; - if (!template.bp_addr) - return KDB_BADINT; - - /* - * Find an empty bp structure to allocate - */ - for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { - if (bp->bp_free) - break; - } - - if (bpno == KDB_MAXBPT) - return KDB_TOOMANYBPT; - - if (strcmp(argv[0], "bph") == 0) { - template.bp_type = BP_HARDWARE_BREAKPOINT; - diag = kdb_parsebp(argc, argv, &nextarg, &template); - if (diag) - return diag; - } else { - template.bp_type = BP_BREAKPOINT; - } - - /* - * Check 
for clashing breakpoints. - * - * Note, in this design we can't have hardware breakpoints - * enabled for both read and write on the same address. - */ - for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT; - i++, bp_check++) { - if (!bp_check->bp_free && - bp_check->bp_addr == template.bp_addr) { - kdb_printf("You already have a breakpoint at " - kdb_bfd_vma_fmt0 "\n", template.bp_addr); - return KDB_DUPBPT; - } - } - - template.bp_enabled = 1; - - /* - * Actually allocate the breakpoint found earlier - */ - *bp = template; - bp->bp_free = 0; - - kdb_printbp(bp, bpno); - - return 0; -} - -/* - * kdb_bc - * - * Handles the 'bc', 'be', and 'bd' commands - * - * [bd|bc|be] - * [bd|bc|be] * - * - * Parameters: - * argc Count of arguments in argv - * argv Space delimited command line arguments - * Outputs: - * None. - * Returns: - * Zero for success, a kdb diagnostic for failure - * Locking: - * None. - * Remarks: - */ -static int kdb_bc(int argc, const char **argv) -{ - unsigned long addr; - kdb_bp_t *bp = NULL; - int lowbp = KDB_MAXBPT; - int highbp = 0; - int done = 0; - int i; - int diag = 0; - - int cmd; /* KDBCMD_B? */ -#define KDBCMD_BC 0 -#define KDBCMD_BE 1 -#define KDBCMD_BD 2 - - if (strcmp(argv[0], "be") == 0) - cmd = KDBCMD_BE; - else if (strcmp(argv[0], "bd") == 0) - cmd = KDBCMD_BD; - else - cmd = KDBCMD_BC; - - if (argc != 1) - return KDB_ARGCOUNT; - - if (strcmp(argv[1], "*") == 0) { - lowbp = 0; - highbp = KDB_MAXBPT; - } else { - diag = kdbgetularg(argv[1], &addr); - if (diag) - return diag; - - /* - * For addresses less than the maximum breakpoint number, - * assume that the breakpoint number is desired. 
- */ - if (addr < KDB_MAXBPT) { - bp = &kdb_breakpoints[addr]; - lowbp = highbp = addr; - highbp++; - } else { - for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; - i++, bp++) { - if (bp->bp_addr == addr) { - lowbp = highbp = i; - highbp++; - break; - } - } - } - } - - /* - * Now operate on the set of breakpoints matching the input - * criteria (either '*' for all, or an individual breakpoint). - */ - for (bp = &kdb_breakpoints[lowbp], i = lowbp; - i < highbp; - i++, bp++) { - if (bp->bp_free) - continue; - - done++; - - switch (cmd) { - case KDBCMD_BC: - bp->bp_enabled = 0; - - kdb_printf("Breakpoint %d at " - kdb_bfd_vma_fmt " cleared\n", - i, bp->bp_addr); - - bp->bp_addr = 0; - bp->bp_free = 1; - - break; - case KDBCMD_BE: - bp->bp_enabled = 1; - - kdb_printf("Breakpoint %d at " - kdb_bfd_vma_fmt " enabled", - i, bp->bp_addr); - - kdb_printf("\n"); - break; - case KDBCMD_BD: - if (!bp->bp_enabled) - break; - - bp->bp_enabled = 0; - - kdb_printf("Breakpoint %d at " - kdb_bfd_vma_fmt " disabled\n", - i, bp->bp_addr); - - break; - } - if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) { - bp->bp_delay = 0; - KDB_STATE_CLEAR(SSBPT); - } - } - - return (!done) ? KDB_BPTNOTFOUND : 0; -} - -/* - * kdb_ss - * - * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) - * commands. - * - * ss - * ssb - * - * Parameters: - * argc Argument count - * argv Argument vector - * Outputs: - * None. - * Returns: - * KDB_CMD_SS[B] for success, a kdb error if failure. - * Locking: - * None. - * Remarks: - * - * Set the arch specific option to trigger a debug trap after the next - * instruction. - * - * For 'ssb', set the trace flag in the debug trap handler - * after printing the current insn and return directly without - * invoking the kdb command processor, until a branch instruction - * is encountered. 
- */ - -static int kdb_ss(int argc, const char **argv) -{ - int ssb = 0; - - ssb = (strcmp(argv[0], "ssb") == 0); - if (argc != 0) - return KDB_ARGCOUNT; - /* - * Set trace flag and go. - */ - KDB_STATE_SET(DOING_SS); - if (ssb) { - KDB_STATE_SET(DOING_SSB); - return KDB_CMD_SSB; - } - return KDB_CMD_SS; -} - -/* Initialize the breakpoint table and register breakpoint commands. */ - -void __init kdb_initbptab(void) -{ - int i; - kdb_bp_t *bp; - - /* - * First time initialization. - */ - memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints)); - - for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) - bp->bp_free = 1; - - kdb_register_repeat("bp", kdb_bp, "[]", - "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bl", kdb_bp, "[]", - "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); - if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) - kdb_register_repeat("bph", kdb_bp, "[]", - "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("bc", kdb_bc, "", - "Clear Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("be", kdb_bc, "", - "Enable Breakpoint", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bd", kdb_bc, "", - "Disable Breakpoint", 0, KDB_REPEAT_NONE); - - kdb_register_repeat("ss", kdb_ss, "", - "Single Step", 1, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("ssb", kdb_ss, "", - "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); - /* - * Architecture dependent initialization. - */ -} -/* - * Kernel Debugger Architecture Independent Stack Traceback - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include "kdb_private.h" - - -static void kdb_show_stack(struct task_struct *p, void *addr) -{ - int old_lvl = console_loglevel; - console_loglevel = 15; - kdb_trap_printk++; - kdb_set_current_task(p); - if (addr) { - show_stack((struct task_struct *)p, addr); - } else if (kdb_current_regs) { -#ifdef CONFIG_X86 - show_stack(p, &kdb_current_regs->sp); -#else - show_stack(p, NULL); -#endif - } else { - show_stack(p, NULL); - } - console_loglevel = old_lvl; - kdb_trap_printk--; -} - -/* - * kdb_bt - * - * This function implements the 'bt' command. Print a stack - * traceback. - * - * bt [] (addr-exp is for alternate stacks) - * btp Kernel stack for - * btt Kernel stack for task structure at - * - * bta [DRSTCZEUIMA] All useful processes, optionally - * filtered by state - * btc [] The current process on one cpu, - * default is all cpus - * - * bt refers to a address on the stack, that location - * is assumed to contain a return address. - * - * btt refers to the address of a struct task. - * - * Inputs: - * argc argument count - * argv argument vector - * Outputs: - * None. - * Returns: - * zero for success, a kdb diagnostic if error - * Locking: - * none. - * Remarks: - * Backtrack works best when the code uses frame pointers. But even - * without frame pointers we should get a reasonable trace. - * - * mds comes in handy when examining the stack to do a manual traceback or - * to get a starting point for bt . 
- */ - -static int -kdb_bt1(struct task_struct *p, unsigned long mask, - int argcount, int btaprompt) -{ - char buffer[2]; - if (kdb_getarea(buffer[0], (unsigned long)p) || - kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) - return KDB_BADADDR; - if (!kdb_task_state(p, mask)) - return 0; - kdb_printf("Stack traceback for pid %d\n", p->pid); - kdb_ps1(p); - kdb_show_stack(p, NULL); - if (btaprompt) { - kdb_getstr(buffer, sizeof(buffer), - "Enter to end, to continue:"); - if (buffer[0] == 'q') { - kdb_printf("\n"); - return 1; - } - } - touch_nmi_watchdog(); - return 0; -} - -int -kdb_bt(int argc, const char **argv) -{ - int diag; - int argcount = 5; - int btaprompt = 1; - int nextarg; - unsigned long addr; - long offset; - - /* Prompt after each proc in bta */ - kdbgetintenv("BTAPROMPT", &btaprompt); - - if (strcmp(argv[0], "bta") == 0) { - struct task_struct *g, *p; - unsigned long cpu; - unsigned long mask = kdb_task_state_string(argc ? argv[1] : - NULL); - if (argc == 0) - kdb_ps_suppressed(); - /* Run the active tasks first */ - for_each_online_cpu(cpu) { - p = kdb_curr_task(cpu); - if (kdb_bt1(p, mask, argcount, btaprompt)) - return 0; - } - /* Now the inactive tasks */ - kdb_do_each_thread(g, p) { - if (task_curr(p)) - continue; - if (kdb_bt1(p, mask, argcount, btaprompt)) - return 0; - } kdb_while_each_thread(g, p); - } else if (strcmp(argv[0], "btp") == 0) { - struct task_struct *p; - unsigned long pid; - if (argc != 1) - return KDB_ARGCOUNT; - diag = kdbgetularg((char *)argv[1], &pid); - if (diag) - return diag; - p = find_task_by_pid_ns(pid, &init_pid_ns); - if (p) { - kdb_set_current_task(p); - return kdb_bt1(p, ~0UL, argcount, 0); - } - kdb_printf("No process with pid == %ld found\n", pid); - return 0; - } else if (strcmp(argv[0], "btt") == 0) { - if (argc != 1) - return KDB_ARGCOUNT; - diag = kdbgetularg((char *)argv[1], &addr); - if (diag) - return diag; - kdb_set_current_task((struct task_struct *)addr); - return kdb_bt1((struct task_struct *)addr, 
~0UL, argcount, 0); - } else if (strcmp(argv[0], "btc") == 0) { - unsigned long cpu = ~0; - struct task_struct *save_current_task = kdb_current_task; - char buf[80]; - if (argc > 1) - return KDB_ARGCOUNT; - if (argc == 1) { - diag = kdbgetularg((char *)argv[1], &cpu); - if (diag) - return diag; - } - /* Recursive use of kdb_parse, do not use argv after - * this point */ - argv = NULL; - if (cpu != ~0) { - if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { - kdb_printf("no process for cpu %ld\n", cpu); - return 0; - } - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); - kdb_parse(buf); - return 0; - } - kdb_printf("btc: cpu status: "); - kdb_parse("cpu\n"); - for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); - kdb_parse(buf); - touch_nmi_watchdog(); - } - kdb_set_current_task(save_current_task); - return 0; - } else { - if (argc) { - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, - &offset, NULL); - if (diag) - return diag; - kdb_show_stack(kdb_current_task, (void *)addr); - return 0; - } else { - return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); - } - } - - /* NOTREACHED */ - return 0; -} -/* - * Created by: Jason Wessel - * - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. 
- */ - -#include -#include -#include -#include -#include "kdb_private.h" -#include "../debug_core.h" - -/* - * KDB interface to KGDB internals - */ -get_char_func kdb_poll_funcs[] = { - dbg_io_get_char, - NULL, - NULL, - NULL, - NULL, - NULL, -}; -EXPORT_SYMBOL_GPL(kdb_poll_funcs); - -int kdb_poll_idx = 1; -EXPORT_SYMBOL_GPL(kdb_poll_idx); - -static struct kgdb_state *kdb_ks; - -int kdb_stub(struct kgdb_state *ks) -{ - int error = 0; - kdb_bp_t *bp; - unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - kdb_reason_t reason = KDB_REASON_OOPS; - kdb_dbtrap_t db_result = KDB_DB_NOBPT; - int i; - - kdb_ks = ks; - if (KDB_STATE(REENTRY)) { - reason = KDB_REASON_SWITCH; - KDB_STATE_CLEAR(REENTRY); - addr = instruction_pointer(ks->linux_regs); - } - ks->pass_exception = 0; - if (atomic_read(&kgdb_setting_breakpoint)) - reason = KDB_REASON_KEYBOARD; - - for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { - if ((bp->bp_enabled) && (bp->bp_addr == addr)) { - reason = KDB_REASON_BREAK; - db_result = KDB_DB_BPT; - if (addr != instruction_pointer(ks->linux_regs)) - kgdb_arch_set_pc(ks->linux_regs, addr); - break; - } - } - if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) { - for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { - if (bp->bp_free) - continue; - if (bp->bp_addr == addr) { - bp->bp_delay = 1; - bp->bp_delayed = 1; - /* - * SSBPT is set when the kernel debugger must single step a - * task in order to re-establish an instruction breakpoint - * which uses the instruction replacement mechanism. It is - * cleared by any action that removes the need to single-step - * the breakpoint. 
- */ - reason = KDB_REASON_BREAK; - db_result = KDB_DB_BPT; - KDB_STATE_SET(SSBPT); - break; - } - } - } - - if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 && - ks->signo == SIGTRAP) { - reason = KDB_REASON_SSTEP; - db_result = KDB_DB_BPT; - } - /* Set initial kdb state variables */ - KDB_STATE_CLEAR(KGDB_TRANS); - kdb_initial_cpu = atomic_read(&kgdb_active); - kdb_current_task = kgdb_info[ks->cpu].task; - kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; - /* Remove any breakpoints as needed by kdb and clear single step */ - kdb_bp_remove(); - KDB_STATE_CLEAR(DOING_SS); - KDB_STATE_CLEAR(DOING_SSB); - KDB_STATE_SET(PAGER); - /* zero out any offline cpu data */ - for_each_present_cpu(i) { - if (!cpu_online(i)) { - kgdb_info[i].debuggerinfo = NULL; - kgdb_info[i].task = NULL; - } - } - if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { - ks->pass_exception = 1; - KDB_FLAG_SET(CATASTROPHIC); - } - if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { - KDB_STATE_CLEAR(SSBPT); - KDB_STATE_CLEAR(DOING_SS); - } else { - /* Start kdb main loop */ - error = kdb_main_loop(KDB_REASON_ENTER, reason, - ks->err_code, db_result, ks->linux_regs); - } - /* - * Upon exit from the kdb main loop setup break points and restart - * the system based on the requested continue state - */ - kdb_initial_cpu = -1; - kdb_current_task = NULL; - kdb_current_regs = NULL; - KDB_STATE_CLEAR(PAGER); - kdbnearsym_cleanup(); - if (error == KDB_CMD_KGDB) { - if (KDB_STATE(DOING_KGDB)) - KDB_STATE_CLEAR(DOING_KGDB); - return DBG_PASS_EVENT; - } - kdb_bp_install(ks->linux_regs); - dbg_activate_sw_breakpoints(); - /* Set the exit state to a single step or a continue */ - if (KDB_STATE(DOING_SS)) - gdbstub_state(ks, "s"); - else - gdbstub_state(ks, "c"); - - KDB_FLAG_CLEAR(CATASTROPHIC); - - /* Invoke arch specific exception handling prior to system resume */ - kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e"); - if (ks->pass_exception) - kgdb_info[ks->cpu].ret_state = 1; - if 
(error == KDB_CMD_CPU) { - KDB_STATE_SET(REENTRY); - /* - * Force clear the single step bit because kdb emulates this - * differently vs the gdbstub - */ - kgdb_single_step = 0; - dbg_deactivate_sw_breakpoints(); - return DBG_SWITCH_CPU_EVENT; - } - return kgdb_info[ks->cpu].ret_state; -} - -void kdb_gdb_state_pass(char *buf) -{ - gdbstub_state(kdb_ks, buf); -} -/* - * Kernel Debugger Architecture Independent Console I/O handler - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "kdb_private.h" - -#define CMD_BUFLEN 256 -char kdb_prompt_str[CMD_BUFLEN]; - -int kdb_trap_printk; - -static int kgdb_transition_check(char *buffer) -{ - if (buffer[0] != '+' && buffer[0] != '$') { - KDB_STATE_SET(KGDB_TRANS); - kdb_printf("%s", buffer); - } else { - int slen = strlen(buffer); - if (slen > 3 && buffer[slen - 3] == '#') { - kdb_gdb_state_pass(buffer); - strcpy(buffer, "kgdb"); - KDB_STATE_SET(DOING_KGDB); - return 1; - } - } - return 0; -} - -static int kdb_read_get_key(char *buffer, size_t bufsize) -{ -#define ESCAPE_UDELAY 1000 -#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ - char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ - char *ped = escape_data; - int escape_delay = 0; - get_char_func *f, *f_escape = NULL; - int key; - - for (f = &kdb_poll_funcs[0]; ; ++f) { - if (*f == NULL) { - /* Reset NMI watchdog once per poll loop */ - touch_nmi_watchdog(); - f = &kdb_poll_funcs[0]; - } - if (escape_delay == 2) { - *ped = '\0'; - ped = escape_data; - --escape_delay; - } - if (escape_delay == 1) 
{ - key = *ped++; - if (!*ped) - --escape_delay; - break; - } - key = (*f)(); - if (key == -1) { - if (escape_delay) { - udelay(ESCAPE_UDELAY); - --escape_delay; - } - continue; - } - if (bufsize <= 2) { - if (key == '\r') - key = '\n'; - *buffer++ = key; - *buffer = '\0'; - return -1; - } - if (escape_delay == 0 && key == '\e') { - escape_delay = ESCAPE_DELAY; - ped = escape_data; - f_escape = f; - } - if (escape_delay) { - *ped++ = key; - if (f_escape != f) { - escape_delay = 2; - continue; - } - if (ped - escape_data == 1) { - /* \e */ - continue; - } else if (ped - escape_data == 2) { - /* \e */ - if (key != '[') - escape_delay = 2; - continue; - } else if (ped - escape_data == 3) { - /* \e[ */ - int mapkey = 0; - switch (key) { - case 'A': /* \e[A, up arrow */ - mapkey = 16; - break; - case 'B': /* \e[B, down arrow */ - mapkey = 14; - break; - case 'C': /* \e[C, right arrow */ - mapkey = 6; - break; - case 'D': /* \e[D, left arrow */ - mapkey = 2; - break; - case '1': /* dropthrough */ - case '3': /* dropthrough */ - /* \e[<1,3,4>], may be home, del, end */ - case '4': - mapkey = -1; - break; - } - if (mapkey != -1) { - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - } - continue; - } else if (ped - escape_data == 4) { - /* \e[<1,3,4> */ - int mapkey = 0; - if (key == '~') { - switch (escape_data[2]) { - case '1': /* \e[1~, home */ - mapkey = 1; - break; - case '3': /* \e[3~, del */ - mapkey = 4; - break; - case '4': /* \e[4~, end */ - mapkey = 5; - break; - } - } - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - continue; - } - } - break; /* A key to process */ - } - return key; -} - -/* - * kdb_read - * - * This function reads a string of characters, terminated by - * a newline, or by reaching the end of the supplied buffer, - * from the current kernel debugger console device. - * Parameters: - * buffer - Address of character buffer to receive input characters. 
- * bufsize - size, in bytes, of the character buffer - * Returns: - * Returns a pointer to the buffer containing the received - * character string. This string will be terminated by a - * newline character. - * Locking: - * No locks are required to be held upon entry to this - * function. It is not reentrant - it relies on the fact - * that while kdb is running on only one "master debug" cpu. - * Remarks: - * - * The buffer size must be >= 2. A buffer size of 2 means that the caller only - * wants a single key. - * - * An escape key could be the start of a vt100 control sequence such as \e[D - * (left arrow) or it could be a character in its own right. The standard - * method for detecting the difference is to wait for 2 seconds to see if there - * are any other characters. kdb is complicated by the lack of a timer service - * (interrupts are off), by multiple input sources and by the need to sometimes - * return after just one key. Escape sequence processing has to be done as - * states in the polling loop. 
- */ - -static char *kdb_read(char *buffer, size_t bufsize) -{ - char *cp = buffer; - char *bufend = buffer+bufsize-2; /* Reserve space for newline - * and null byte */ - char *lastchar; - char *p_tmp; - char tmp; - static char tmpbuffer[CMD_BUFLEN]; - int len = strlen(buffer); - int len_tmp; - int tab = 0; - int count; - int i; - int diag, dtab_count; - int key; - - - diag = kdbgetintenv("DTABCOUNT", &dtab_count); - if (diag) - dtab_count = 30; - - if (len > 0) { - cp += len; - if (*(buffer+len-1) == '\n') - cp--; - } - - lastchar = cp; - *cp = '\0'; - kdb_printf("%s", buffer); -poll_again: - key = kdb_read_get_key(buffer, bufsize); - if (key == -1) - return buffer; - if (key != 9) - tab = 0; - switch (key) { - case 8: /* backspace */ - if (cp > buffer) { - if (cp < lastchar) { - memcpy(tmpbuffer, cp, lastchar - cp); - memcpy(cp-1, tmpbuffer, lastchar - cp); - } - *(--lastchar) = '\0'; - --cp; - kdb_printf("\b%s \r", cp); - tmp = *cp; - *cp = '\0'; - kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - *cp = tmp; - } - break; - case 13: /* enter */ - *lastchar++ = '\n'; - *lastchar++ = '\0'; - if (!KDB_STATE(KGDB_TRANS)) { - KDB_STATE_SET(KGDB_TRANS); - kdb_printf("%s", buffer); - } - kdb_printf("\n"); - return buffer; - case 4: /* Del */ - if (cp < lastchar) { - memcpy(tmpbuffer, cp+1, lastchar - cp - 1); - memcpy(cp, tmpbuffer, lastchar - cp - 1); - *(--lastchar) = '\0'; - kdb_printf("%s \r", cp); - tmp = *cp; - *cp = '\0'; - kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - *cp = tmp; - } - break; - case 1: /* Home */ - if (cp > buffer) { - kdb_printf("\r"); - kdb_printf(kdb_prompt_str); - cp = buffer; - } - break; - case 5: /* End */ - if (cp < lastchar) { - kdb_printf("%s", cp); - cp = lastchar; - } - break; - case 2: /* Left */ - if (cp > buffer) { - kdb_printf("\b"); - --cp; - } - break; - case 14: /* Down */ - memset(tmpbuffer, ' ', - strlen(kdb_prompt_str) + (lastchar-buffer)); - *(tmpbuffer+strlen(kdb_prompt_str) + - (lastchar-buffer)) = 
'\0'; - kdb_printf("\r%s\r", tmpbuffer); - *lastchar = (char)key; - *(lastchar+1) = '\0'; - return lastchar; - case 6: /* Right */ - if (cp < lastchar) { - kdb_printf("%c", *cp); - ++cp; - } - break; - case 16: /* Up */ - memset(tmpbuffer, ' ', - strlen(kdb_prompt_str) + (lastchar-buffer)); - *(tmpbuffer+strlen(kdb_prompt_str) + - (lastchar-buffer)) = '\0'; - kdb_printf("\r%s\r", tmpbuffer); - *lastchar = (char)key; - *(lastchar+1) = '\0'; - return lastchar; - case 9: /* Tab */ - if (tab < 2) - ++tab; - p_tmp = buffer; - while (*p_tmp == ' ') - p_tmp++; - if (p_tmp > cp) - break; - memcpy(tmpbuffer, p_tmp, cp-p_tmp); - *(tmpbuffer + (cp-p_tmp)) = '\0'; - p_tmp = strrchr(tmpbuffer, ' '); - if (p_tmp) - ++p_tmp; - else - p_tmp = tmpbuffer; - len = strlen(p_tmp); - count = kallsyms_symbol_complete(p_tmp, - sizeof(tmpbuffer) - - (p_tmp - tmpbuffer)); - if (tab == 2 && count > 0) { - kdb_printf("\n%d symbols are found.", count); - if (count > dtab_count) { - count = dtab_count; - kdb_printf(" But only first %d symbols will" - " be printed.\nYou can change the" - " environment variable DTABCOUNT.", - count); - } - kdb_printf("\n"); - for (i = 0; i < count; i++) { - if (kallsyms_symbol_next(p_tmp, i) < 0) - break; - kdb_printf("%s ", p_tmp); - *(p_tmp + len) = '\0'; - } - if (i >= dtab_count) - kdb_printf("..."); - kdb_printf("\n"); - kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - } else if (tab != 2 && count > 0) { - len_tmp = strlen(p_tmp); - strncpy(p_tmp+len_tmp, cp, lastchar-cp+1); - len_tmp = strlen(p_tmp); - strncpy(cp, p_tmp+len, len_tmp-len + 1); - len = len_tmp - len; - kdb_printf("%s", cp); - cp += len; - lastchar += len; - } - kdb_nextline = 1; /* reset output line number */ - break; - default: - if (key >= 32 && lastchar < bufend) { - if (cp < lastchar) { - memcpy(tmpbuffer, cp, lastchar - cp); - memcpy(cp+1, tmpbuffer, lastchar - cp); - *++lastchar = '\0'; - *cp = key; - kdb_printf("%s\r", cp); - ++cp; - tmp = *cp; - *cp = '\0'; - 
kdb_printf(kdb_prompt_str); - kdb_printf("%s", buffer); - *cp = tmp; - } else { - *++lastchar = '\0'; - *cp++ = key; - /* The kgdb transition check will hide - * printed characters if we think that - * kgdb is connecting, until the check - * fails */ - if (!KDB_STATE(KGDB_TRANS)) { - if (kgdb_transition_check(buffer)) - return buffer; - } else { - kdb_printf("%c", key); - } - } - /* Special escape to kgdb */ - if (lastchar - buffer >= 5 && - strcmp(lastchar - 5, "$?#3f") == 0) { - kdb_gdb_state_pass(lastchar - 5); - strcpy(buffer, "kgdb"); - KDB_STATE_SET(DOING_KGDB); - return buffer; - } - if (lastchar - buffer >= 11 && - strcmp(lastchar - 11, "$qSupported") == 0) { - kdb_gdb_state_pass(lastchar - 11); - strcpy(buffer, "kgdb"); - KDB_STATE_SET(DOING_KGDB); - return buffer; - } - } - break; - } - goto poll_again; -} - -/* - * kdb_getstr - * - * Print the prompt string and read a command from the - * input device. - * - * Parameters: - * buffer Address of buffer to receive command - * bufsize Size of buffer in bytes - * prompt Pointer to string to use as prompt string - * Returns: - * Pointer to command buffer. - * Locking: - * None. - * Remarks: - * For SMP kernels, the processor number will be - * substituted for %d, %x or %o in the prompt. - */ - -char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) -{ - if (prompt && kdb_prompt_str != prompt) - strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); - kdb_printf(kdb_prompt_str); - kdb_nextline = 1; /* Prompt and input resets line number */ - return kdb_read(buffer, bufsize); -} - -/* - * kdb_input_flush - * - * Get rid of any buffered console input. - * - * Parameters: - * none - * Returns: - * nothing - * Locking: - * none - * Remarks: - * Call this function whenever you want to flush input. If there is any - * outstanding input, it ignores all characters until there has been no - * data for approximately 1ms. 
- */ - -static void kdb_input_flush(void) -{ - get_char_func *f; - int res; - int flush_delay = 1; - while (flush_delay) { - flush_delay--; -empty: - touch_nmi_watchdog(); - for (f = &kdb_poll_funcs[0]; *f; ++f) { - res = (*f)(); - if (res != -1) { - flush_delay = 1; - goto empty; - } - } - if (flush_delay) - mdelay(1); - } -} - -/* - * kdb_printf - * - * Print a string to the output device(s). - * - * Parameters: - * printf-like format and optional args. - * Returns: - * 0 - * Locking: - * None. - * Remarks: - * use 'kdbcons->write()' to avoid polluting 'log_buf' with - * kdb output. - * - * If the user is doing a cmd args | grep srch - * then kdb_grepping_flag is set. - * In that case we need to accumulate full lines (ending in \n) before - * searching for the pattern. - */ - -static char kdb_buffer[256]; /* A bit too big to go on stack */ -static char *next_avail = kdb_buffer; -static int size_avail; -static int suspend_grep; - -/* - * search arg1 to see if it contains arg2 - * (kdmain.c provides flags for ^pat and pat$) - * - * return 1 for found, 0 for not found - */ -static int kdb_search_string(char *searched, char *searchfor) -{ - char firstchar, *cp; - int len1, len2; - - /* not counting the newline at the end of "searched" */ - len1 = strlen(searched)-1; - len2 = strlen(searchfor); - if (len1 < len2) - return 0; - if (kdb_grep_leading && kdb_grep_trailing && len1 != len2) - return 0; - if (kdb_grep_leading) { - if (!strncmp(searched, searchfor, len2)) - return 1; - } else if (kdb_grep_trailing) { - if (!strncmp(searched+len1-len2, searchfor, len2)) - return 1; - } else { - firstchar = *searchfor; - cp = searched; - while ((cp = strchr(cp, firstchar))) { - if (!strncmp(cp, searchfor, len2)) - return 1; - cp++; - } - } - return 0; -} - -int vkdb_printf(const char *fmt, va_list ap) -{ - int diag; - int linecount; - int logging, saved_loglevel = 0; - int saved_trap_printk; - int got_printf_lock = 0; - int retlen = 0; - int fnd, len; - char *cp, *cp2, *cphold 
= NULL, replaced_byte = ' '; - char *moreprompt = "more> "; - struct console *c = console_drivers; - static DEFINE_SPINLOCK(kdb_printf_lock); - unsigned long uninitialized_var(flags); - - preempt_disable(); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; - - /* Serialize kdb_printf if multiple cpus try to write at once. - * But if any cpu goes recursive in kdb, just print the output, - * even if it is interleaved with any other text. - */ - if (!KDB_STATE(PRINTF_LOCK)) { - KDB_STATE_SET(PRINTF_LOCK); - spin_lock_irqsave(&kdb_printf_lock, flags); - got_printf_lock = 1; - atomic_inc(&kdb_event); - } else { - __acquire(kdb_printf_lock); - } - - diag = kdbgetintenv("LINES", &linecount); - if (diag || linecount <= 1) - linecount = 24; - - diag = kdbgetintenv("LOGGING", &logging); - if (diag) - logging = 0; - - if (!kdb_grepping_flag || suspend_grep) { - /* normally, every vsnprintf starts a new buffer */ - next_avail = kdb_buffer; - size_avail = sizeof(kdb_buffer); - } - vsnprintf(next_avail, size_avail, fmt, ap); - - /* - * If kdb_parse() found that the command was cmd xxx | grep yyy - * then kdb_grepping_flag is set, and kdb_grep_string contains yyy - * - * Accumulate the print data up to a newline before searching it. - * (vsnprintf does null-terminate the string that it generates) - */ - - /* skip the search if prints are temporarily unconditional */ - if (!suspend_grep && kdb_grepping_flag) { - cp = strchr(kdb_buffer, '\n'); - if (!cp) { - /* - * Special cases that don't end with newlines - * but should be written without one: - * The "[nn]kdb> " prompt should - * appear at the front of the buffer. - * - * The "[nn]more " prompt should also be - * (MOREPROMPT -> moreprompt) - * written * but we print that ourselves, - * we set the suspend_grep flag to make - * it unconditional. 
- * - */ - if (next_avail == kdb_buffer) { - /* - * these should occur after a newline, - * so they will be at the front of the - * buffer - */ - cp2 = kdb_buffer; - len = strlen(kdb_prompt_str); - if (!strncmp(cp2, kdb_prompt_str, len)) { - /* - * We're about to start a new - * command, so we can go back - * to normal mode. - */ - kdb_grepping_flag = 0; - goto kdb_printit; - } - } - /* no newline; don't search/write the buffer - until one is there */ - len = strlen(kdb_buffer); - next_avail = kdb_buffer + len; - size_avail = sizeof(kdb_buffer) - len; - goto kdb_print_out; - } - - /* - * The newline is present; print through it or discard - * it, depending on the results of the search. - */ - cp++; /* to byte after the newline */ - replaced_byte = *cp; /* remember what/where it was */ - cphold = cp; - *cp = '\0'; /* end the string for our search */ - - /* - * We now have a newline at the end of the string - * Only continue with this output if it contains the - * search string. - */ - fnd = kdb_search_string(kdb_buffer, kdb_grep_string); - if (!fnd) { - /* - * At this point the complete line at the start - * of kdb_buffer can be discarded, as it does - * not contain what the user is looking for. - * Shift the buffer left. - */ - *cphold = replaced_byte; - strcpy(kdb_buffer, cphold); - len = strlen(kdb_buffer); - next_avail = kdb_buffer + len; - size_avail = sizeof(kdb_buffer) - len; - goto kdb_print_out; - } - /* - * at this point the string is a full line and - * should be printed, up to the null. - */ - } -kdb_printit: - - /* - * Write to all consoles. 
- */ - retlen = strlen(kdb_buffer); - if (!dbg_kdb_mode && kgdb_connected) { - gdbstub_msg_write(kdb_buffer, retlen); - } else { - if (!dbg_io_ops->is_console) { - len = strlen(kdb_buffer); - cp = kdb_buffer; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } - } - while (c) { - c->write(c, kdb_buffer, retlen); - touch_nmi_watchdog(); - c = c->next; - } - } - if (logging) { - saved_loglevel = console_loglevel; - console_loglevel = 0; - printk(KERN_INFO "%s", kdb_buffer); - } - - if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) - kdb_nextline++; - - /* check for having reached the LINES number of printed lines */ - if (kdb_nextline == linecount) { - char buf1[16] = ""; -#if defined(CONFIG_SMP) - char buf2[32]; -#endif - - /* Watch out for recursion here. Any routine that calls - * kdb_printf will come back through here. And kdb_read - * uses kdb_printf to echo on serial consoles ... - */ - kdb_nextline = 1; /* In case of recursion */ - - /* - * Pause until cr. - */ - moreprompt = kdbgetenv("MOREPROMPT"); - if (moreprompt == NULL) - moreprompt = "more> "; - -#if defined(CONFIG_SMP) - if (strchr(moreprompt, '%')) { - sprintf(buf2, moreprompt, get_cpu()); - put_cpu(); - moreprompt = buf2; - } -#endif - - kdb_input_flush(); - c = console_drivers; - - if (!dbg_io_ops->is_console) { - len = strlen(moreprompt); - cp = moreprompt; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } - } - while (c) { - c->write(c, moreprompt, strlen(moreprompt)); - touch_nmi_watchdog(); - c = c->next; - } - - if (logging) - printk("%s", moreprompt); - - kdb_read(buf1, 2); /* '2' indicates to return - * immediately after getting one key. 
*/ - kdb_nextline = 1; /* Really set output line 1 */ - - /* empty and reset the buffer: */ - kdb_buffer[0] = '\0'; - next_avail = kdb_buffer; - size_avail = sizeof(kdb_buffer); - if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { - /* user hit q or Q */ - KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ - KDB_STATE_CLEAR(PAGER); - /* end of command output; back to normal mode */ - kdb_grepping_flag = 0; - kdb_printf("\n"); - } else if (buf1[0] == ' ') { - kdb_printf("\n"); - suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '\n') { - kdb_nextline = linecount - 1; - kdb_printf("\r"); - suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] && buf1[0] != '\n') { - /* user hit something other than enter */ - suspend_grep = 1; /* for this recursion */ - kdb_printf("\nOnly 'q' or 'Q' are processed at more " - "prompt, input ignored\n"); - } else if (kdb_grepping_flag) { - /* user hit enter */ - suspend_grep = 1; /* for this recursion */ - kdb_printf("\n"); - } - kdb_input_flush(); - } - - /* - * For grep searches, shift the printed string left. - * replaced_byte contains the character that was overwritten with - * the terminating null, and cphold points to the null. - * Then adjust the notion of available space in the buffer. - */ - if (kdb_grepping_flag && !suspend_grep) { - *cphold = replaced_byte; - strcpy(kdb_buffer, cphold); - len = strlen(kdb_buffer); - next_avail = kdb_buffer + len; - size_avail = sizeof(kdb_buffer) - len; - } - -kdb_print_out: - suspend_grep = 0; /* end of what may have been a recursive call */ - if (logging) - console_loglevel = saved_loglevel; - if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { - got_printf_lock = 0; - spin_unlock_irqrestore(&kdb_printf_lock, flags); - KDB_STATE_CLEAR(PRINTF_LOCK); - atomic_dec(&kdb_event); - } else { - __release(kdb_printf_lock); - } - kdb_trap_printk = saved_trap_printk; - preempt_enable(); - return retlen; -} - -int kdb_printf(const char *fmt, ...) 
-{ - va_list ap; - int r; - - va_start(ap, fmt); - r = vkdb_printf(fmt, ap); - va_end(ap); - - return r; -} -EXPORT_SYMBOL_GPL(kdb_printf); -/* - * Kernel Debugger Architecture Dependent Console I/O handler - * - * This file is subject to the terms and conditions of the GNU General Public - * License. - * - * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. - */ - -#include -#include -#include -#include -#include - -/* Keyboard Controller Registers on normal PCs. */ - -#define KBD_STATUS_REG 0x64 /* Status register (R) */ -#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */ - -/* Status Register Bits */ - -#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */ -#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ - -static int kbd_exists; - -/* - * Check if the keyboard controller has a keypress for us. - * Some parts (Enter Release, LED change) are still blocking polled here, - * but hopefully they are all short. - */ -int kdb_get_kbd_char(void) -{ - int scancode, scanstatus; - static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */ - static int shift_key; /* Shift next keypress */ - static int ctrl_key; - u_short keychar; - - if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) || - (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) { - kbd_exists = 0; - return -1; - } - kbd_exists = 1; - - if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) - return -1; - - /* - * Fetch the scancode - */ - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - - /* - * Ignore mouse events. - */ - if (scanstatus & KBD_STAT_MOUSE_OBF) - return -1; - - /* - * Ignore release, trigger on make - * (except for shift keys, where we want to - * keep the shift state so long as the key is - * held down). 
- */ - - if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) { - /* - * Next key may use shift table - */ - if ((scancode & 0x80) == 0) - shift_key = 1; - else - shift_key = 0; - return -1; - } - - if ((scancode&0x7f) == 0x1d) { - /* - * Left ctrl key - */ - if ((scancode & 0x80) == 0) - ctrl_key = 1; - else - ctrl_key = 0; - return -1; - } - - if ((scancode & 0x80) != 0) - return -1; - - scancode &= 0x7f; - - /* - * Translate scancode - */ - - if (scancode == 0x3a) { - /* - * Toggle caps lock - */ - shift_lock ^= 1; - -#ifdef KDB_BLINK_LED - kdb_toggleled(0x4); -#endif - return -1; - } - - if (scancode == 0x0e) { - /* - * Backspace - */ - return 8; - } - - /* Special Key */ - switch (scancode) { - case 0xF: /* Tab */ - return 9; - case 0x53: /* Del */ - return 4; - case 0x47: /* Home */ - return 1; - case 0x4F: /* End */ - return 5; - case 0x4B: /* Left */ - return 2; - case 0x48: /* Up */ - return 16; - case 0x50: /* Down */ - return 14; - case 0x4D: /* Right */ - return 6; - } - - if (scancode == 0xe0) - return -1; - - /* - * For Japanese 86/106 keyboards - * See comment in drivers/char/pc_keyb.c. - * - Masahiro Adegawa - */ - if (scancode == 0x73) - scancode = 0x59; - else if (scancode == 0x7d) - scancode = 0x7c; - - if (!shift_lock && !shift_key && !ctrl_key) { - keychar = plain_map[scancode]; - } else if ((shift_lock || shift_key) && key_maps[1]) { - keychar = key_maps[1][scancode]; - } else if (ctrl_key && key_maps[4]) { - keychar = key_maps[4][scancode]; - } else { - keychar = 0x0020; - kdb_printf("Unknown state/scancode (%d)\n", scancode); - } - keychar &= 0x0fff; - if (keychar == '\t') - keychar = ' '; - switch (KTYP(keychar)) { - case KT_LETTER: - case KT_LATIN: - if (isprint(keychar)) - break; /* printable characters */ - /* drop through */ - case KT_SPEC: - if (keychar == K_ENTER) - break; - /* drop through */ - default: - return -1; /* ignore unprintables */ - } - - if ((scancode & 0x7f) == 0x1c) { - /* - * enter key. All done. 
Absorb the release scancode. - */ - while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) - ; - - /* - * Fetch the scancode - */ - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - - while (scanstatus & KBD_STAT_MOUSE_OBF) { - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - } - - if (scancode != 0x9c) { - /* - * Wasn't an enter-release, why not? - */ - kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", - scancode, scanstatus); - } - - return 13; - } - - return keychar & 0xff; -} -EXPORT_SYMBOL_GPL(kdb_get_kbd_char); -/* - * Kernel Debugger Architecture Independent Main Code - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (C) 2000 Stephane Eranian - * Xscale (R) modifications copyright (C) 2003 Intel Corporation. - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "kdb_private.h" - -#define GREP_LEN 256 -char kdb_grep_string[GREP_LEN]; -int kdb_grepping_flag; -EXPORT_SYMBOL(kdb_grepping_flag); -int kdb_grep_leading; -int kdb_grep_trailing; - -/* - * Kernel debugger state flags - */ -int kdb_flags; -atomic_t kdb_event; - -/* - * kdb_lock protects updates to kdb_initial_cpu. Used to - * single thread processors through the kernel debugger. 
- */ -int kdb_initial_cpu = -1; /* cpu number that owns kdb */ -int kdb_nextline = 1; -int kdb_state; /* General KDB state */ - -struct task_struct *kdb_current_task; -EXPORT_SYMBOL(kdb_current_task); -struct pt_regs *kdb_current_regs; - -const char *kdb_diemsg; -static int kdb_go_count; -#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC -static unsigned int kdb_continue_catastrophic = - CONFIG_KDB_CONTINUE_CATASTROPHIC; -#else -static unsigned int kdb_continue_catastrophic; -#endif - -/* kdb_commands describes the available commands. */ -static kdbtab_t *kdb_commands; -#define KDB_BASE_CMD_MAX 50 -static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; -#define for_each_kdbcmd(cmd, num) \ - for ((cmd) = kdb_base_commands, (num) = 0; \ - num < kdb_max_commands; \ - num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++) - -typedef struct _kdbmsg { - int km_diag; /* kdb diagnostic */ - char *km_msg; /* Corresponding message text */ -} kdbmsg_t; - -#define KDBMSG(msgnum, text) \ - { KDB_##msgnum, text } - -static kdbmsg_t kdbmsgs[] = { - KDBMSG(NOTFOUND, "Command Not Found"), - KDBMSG(ARGCOUNT, "Improper argument count, see usage."), - KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, " - "8 is only allowed on 64 bit systems"), - KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"), - KDBMSG(NOTENV, "Cannot find environment variable"), - KDBMSG(NOENVVALUE, "Environment variable should have value"), - KDBMSG(NOTIMP, "Command not implemented"), - KDBMSG(ENVFULL, "Environment full"), - KDBMSG(ENVBUFFULL, "Environment buffer full"), - KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), -#ifdef CONFIG_CPU_XSCALE - KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), -#else - KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"), -#endif - KDBMSG(DUPBPT, "Duplicate breakpoint address"), - KDBMSG(BPTNOTFOUND, "Breakpoint not found"), - KDBMSG(BADMODE, "Invalid IDMODE"), - 
KDBMSG(BADINT, "Illegal numeric value"), - KDBMSG(INVADDRFMT, "Invalid symbolic address format"), - KDBMSG(BADREG, "Invalid register name"), - KDBMSG(BADCPUNUM, "Invalid cpu number"), - KDBMSG(BADLENGTH, "Invalid length field"), - KDBMSG(NOBP, "No Breakpoint exists"), - KDBMSG(BADADDR, "Invalid address"), -}; -#undef KDBMSG - -static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); - - -/* - * Initial environment. This is all kept static and local to - * this file. We don't want to rely on the memory allocation - * mechanisms in the kernel, so we use a very limited allocate-only - * heap for new and altered environment variables. The entire - * environment is limited to a fixed number of entries (add more - * to __env[] if required) and a fixed amount of heap (add more to - * KDB_ENVBUFSIZE if required). - */ - -static char *__env[] = { -#if defined(CONFIG_SMP) - "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", -#else - "PROMPT=kdb> ", - "MOREPROMPT=more> ", -#endif - "RADIX=16", - "MDCOUNT=8", /* lines of md output */ - KDB_PLATFORM_ENV, - "DTABCOUNT=30", - "NOSECT=1", - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, - (char *)0, -}; - -static const int __nenv = (sizeof(__env) / sizeof(char *)); - -struct task_struct *kdb_curr_task(int cpu) -{ - struct task_struct *p = curr_task(cpu); -#ifdef _TIF_MCA_INIT - if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu)) - p = krp->p; -#endif - return p; -} - -/* - * kdbgetenv - This function will return the character string value of - * an environment variable. - * Parameters: - * match A character string representing an environment variable. 
- * Returns: - * NULL No environment variable matches 'match' - * char* Pointer to string value of environment variable. - */ -char *kdbgetenv(const char *match) -{ - char **ep = __env; - int matchlen = strlen(match); - int i; - - for (i = 0; i < __nenv; i++) { - char *e = *ep++; - - if (!e) - continue; - - if ((strncmp(match, e, matchlen) == 0) - && ((e[matchlen] == '\0') - || (e[matchlen] == '='))) { - char *cp = strchr(e, '='); - return cp ? ++cp : ""; - } - } - return NULL; -} - -/* - * kdballocenv - This function is used to allocate bytes for - * environment entries. - * Parameters: - * match A character string representing a numeric value - * Outputs: - * *value the unsigned long representation of the env variable 'match' - * Returns: - * Zero on success, a kdb diagnostic on failure. - * Remarks: - * We use a static environment buffer (envbuffer) to hold the values - * of dynamically generated environment variables (see kdb_set). Buffer - * space once allocated is never free'd, so over time, the amount of space - * (currently 512 bytes) will be exhausted if env variables are changed - * frequently. - */ -static char *kdballocenv(size_t bytes) -{ -#define KDB_ENVBUFSIZE 512 - static char envbuffer[KDB_ENVBUFSIZE]; - static int envbufsize; - char *ep = NULL; - - if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { - ep = &envbuffer[envbufsize]; - envbufsize += bytes; - } - return ep; -} - -/* - * kdbgetulenv - This function will return the value of an unsigned - * long-valued environment variable. - * Parameters: - * match A character string representing a numeric value - * Outputs: - * *value the unsigned long represntation of the env variable 'match' - * Returns: - * Zero on success, a kdb diagnostic on failure. 
- */ -static int kdbgetulenv(const char *match, unsigned long *value) -{ - char *ep; - - ep = kdbgetenv(match); - if (!ep) - return KDB_NOTENV; - if (strlen(ep) == 0) - return KDB_NOENVVALUE; - - *value = simple_strtoul(ep, NULL, 0); - - return 0; -} - -/* - * kdbgetintenv - This function will return the value of an - * integer-valued environment variable. - * Parameters: - * match A character string representing an integer-valued env variable - * Outputs: - * *value the integer representation of the environment variable 'match' - * Returns: - * Zero on success, a kdb diagnostic on failure. - */ -int kdbgetintenv(const char *match, int *value) -{ - unsigned long val; - int diag; - - diag = kdbgetulenv(match, &val); - if (!diag) - *value = (int) val; - return diag; -} - -/* - * kdbgetularg - This function will convert a numeric string into an - * unsigned long value. - * Parameters: - * arg A character string representing a numeric value - * Outputs: - * *value the unsigned long represntation of arg. - * Returns: - * Zero on success, a kdb diagnostic on failure. - */ -int kdbgetularg(const char *arg, unsigned long *value) -{ - char *endp; - unsigned long val; - - val = simple_strtoul(arg, &endp, 0); - - if (endp == arg) { - /* - * Also try base 16, for us folks too lazy to type the - * leading 0x... - */ - val = simple_strtoul(arg, &endp, 16); - if (endp == arg) - return KDB_BADINT; - } - - *value = val; - - return 0; -} - -int kdbgetu64arg(const char *arg, u64 *value) -{ - char *endp; - u64 val; - - val = simple_strtoull(arg, &endp, 0); - - if (endp == arg) { - - val = simple_strtoull(arg, &endp, 16); - if (endp == arg) - return KDB_BADINT; - } - - *value = val; - - return 0; -} - -/* - * kdb_set - This function implements the 'set' command. Alter an - * existing environment variable or create a new one. 
- */ -int kdb_set(int argc, const char **argv) -{ - int i; - char *ep; - size_t varlen, vallen; - - /* - * we can be invoked two ways: - * set var=value argv[1]="var", argv[2]="value" - * set var = value argv[1]="var", argv[2]="=", argv[3]="value" - * - if the latter, shift 'em down. - */ - if (argc == 3) { - argv[2] = argv[3]; - argc--; - } - - if (argc != 2) - return KDB_ARGCOUNT; - - /* - * Check for internal variables - */ - if (strcmp(argv[1], "KDBDEBUG") == 0) { - unsigned int debugflags; - char *cp; - - debugflags = simple_strtoul(argv[2], &cp, 0); - if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) { - kdb_printf("kdb: illegal debug flags '%s'\n", - argv[2]); - return 0; - } - kdb_flags = (kdb_flags & - ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) - | (debugflags << KDB_DEBUG_FLAG_SHIFT); - - return 0; - } - - /* - * Tokenizer squashed the '=' sign. argv[1] is variable - * name, argv[2] = value. - */ - varlen = strlen(argv[1]); - vallen = strlen(argv[2]); - ep = kdballocenv(varlen + vallen + 2); - if (ep == (char *)0) - return KDB_ENVBUFFULL; - - sprintf(ep, "%s=%s", argv[1], argv[2]); - - ep[varlen+vallen+1] = '\0'; - - for (i = 0; i < __nenv; i++) { - if (__env[i] - && ((strncmp(__env[i], argv[1], varlen) == 0) - && ((__env[i][varlen] == '\0') - || (__env[i][varlen] == '=')))) { - __env[i] = ep; - return 0; - } - } - - /* - * Wasn't existing variable. Fit into slot. - */ - for (i = 0; i < __nenv-1; i++) { - if (__env[i] == (char *)0) { - __env[i] = ep; - return 0; - } - } - - return KDB_ENVFULL; -} - -static int kdb_check_regs(void) -{ - if (!kdb_current_regs) { - kdb_printf("No current kdb registers." - " You may need to select another task\n"); - return KDB_BADREG; - } - return 0; -} - -/* - * kdbgetaddrarg - This function is responsible for parsing an - * address-expression and returning the value of the expression, - * symbol name, and offset to the caller. 
- * - * The argument may consist of a numeric value (decimal or - * hexidecimal), a symbol name, a register name (preceded by the - * percent sign), an environment variable with a numeric value - * (preceded by a dollar sign) or a simple arithmetic expression - * consisting of a symbol name, +/-, and a numeric constant value - * (offset). - * Parameters: - * argc - count of arguments in argv - * argv - argument vector - * *nextarg - index to next unparsed argument in argv[] - * regs - Register state at time of KDB entry - * Outputs: - * *value - receives the value of the address-expression - * *offset - receives the offset specified, if any - * *name - receives the symbol name, if any - * *nextarg - index to next unparsed argument in argv[] - * Returns: - * zero is returned on success, a kdb diagnostic code is - * returned on error. - */ -int kdbgetaddrarg(int argc, const char **argv, int *nextarg, - unsigned long *value, long *offset, - char **name) -{ - unsigned long addr; - unsigned long off = 0; - int positive; - int diag; - int found = 0; - char *symname; - char symbol = '\0'; - char *cp; - kdb_symtab_t symtab; - - /* - * Process arguments which follow the following syntax: - * - * symbol | numeric-address [+/- numeric-offset] - * %register - * $environment-variable - */ - - if (*nextarg > argc) - return KDB_ARGCOUNT; - - symname = (char *)argv[*nextarg]; - - /* - * If there is no whitespace between the symbol - * or address and the '+' or '-' symbols, we - * remember the character and replace it with a - * null so the symbol/value can be properly parsed - */ - cp = strpbrk(symname, "+-"); - if (cp != NULL) { - symbol = *cp; - *cp++ = '\0'; - } - - if (symname[0] == '$') { - diag = kdbgetulenv(&symname[1], &addr); - if (diag) - return diag; - } else if (symname[0] == '%') { - diag = kdb_check_regs(); - if (diag) - return diag; - /* Implement register values with % at a later time as it is - * arch optional. 
- */ - return KDB_NOTIMP; - } else { - found = kdbgetsymval(symname, &symtab); - if (found) { - addr = symtab.sym_start; - } else { - diag = kdbgetularg(argv[*nextarg], &addr); - if (diag) - return diag; - } - } - - if (!found) - found = kdbnearsym(addr, &symtab); - - (*nextarg)++; - - if (name) - *name = symname; - if (value) - *value = addr; - if (offset && name && *name) - *offset = addr - symtab.sym_start; - - if ((*nextarg > argc) - && (symbol == '\0')) - return 0; - - /* - * check for +/- and offset - */ - - if (symbol == '\0') { - if ((argv[*nextarg][0] != '+') - && (argv[*nextarg][0] != '-')) { - /* - * Not our argument. Return. - */ - return 0; - } else { - positive = (argv[*nextarg][0] == '+'); - (*nextarg)++; - } - } else - positive = (symbol == '+'); - - /* - * Now there must be an offset! - */ - if ((*nextarg > argc) - && (symbol == '\0')) { - return KDB_INVADDRFMT; - } - - if (!symbol) { - cp = (char *)argv[*nextarg]; - (*nextarg)++; - } - - diag = kdbgetularg(cp, &off); - if (diag) - return diag; - - if (!positive) - off = -off; - - if (offset) - *offset += off; - - if (value) - *value += off; - - return 0; -} - -static void kdb_cmderror(int diag) -{ - int i; - - if (diag >= 0) { - kdb_printf("no error detected (diagnostic is %d)\n", diag); - return; - } - - for (i = 0; i < __nkdb_err; i++) { - if (kdbmsgs[i].km_diag == diag) { - kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg); - return; - } - } - - kdb_printf("Unknown diag %d\n", -diag); -} - -/* - * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd' - * command which defines one command as a set of other commands, - * terminated by endefcmd. kdb_defcmd processes the initial - * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for - * the following commands until 'endefcmd'. 
- * Inputs: - * argc argument count - * argv argument vector - * Returns: - * zero for success, a kdb diagnostic if error - */ -struct defcmd_set { - int count; - int usable; - char *name; - char *usage; - char *help; - char **command; -}; -static struct defcmd_set *defcmd_set; -static int defcmd_set_count; -static int defcmd_in_progress; - -/* Forward references */ -static int kdb_exec_defcmd(int argc, const char **argv); - -static int kdb_defcmd2(const char *cmdstr, const char *argv0) -{ - struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; - char **save_command = s->command; - if (strcmp(argv0, "endefcmd") == 0) { - defcmd_in_progress = 0; - if (!s->count) - s->usable = 0; - if (s->usable) - kdb_register(s->name, kdb_exec_defcmd, - s->usage, s->help, 0); - return 0; - } - if (!s->usable) - return KDB_NOTIMP; - s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); - if (!s->command) { - kdb_printf("Could not allocate new kdb_defcmd table for %s\n", - cmdstr); - s->usable = 0; - return KDB_NOTIMP; - } - memcpy(s->command, save_command, s->count * sizeof(*(s->command))); - s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); - kfree(save_command); - return 0; -} - -static int kdb_defcmd(int argc, const char **argv) -{ - struct defcmd_set *save_defcmd_set = defcmd_set, *s; - if (defcmd_in_progress) { - kdb_printf("kdb: nested defcmd detected, assuming missing " - "endefcmd\n"); - kdb_defcmd2("endefcmd", "endefcmd"); - } - if (argc == 0) { - int i; - for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { - kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, - s->usage, s->help); - for (i = 0; i < s->count; ++i) - kdb_printf("%s", s->command[i]); - kdb_printf("endefcmd\n"); - } - return 0; - } - if (argc != 3) - return KDB_ARGCOUNT; - defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), - GFP_KDB); - if (!defcmd_set) { - kdb_printf("Could not allocate new defcmd_set entry for %s\n", - argv[1]); - defcmd_set = save_defcmd_set; 
- return KDB_NOTIMP; - } - memcpy(defcmd_set, save_defcmd_set, - defcmd_set_count * sizeof(*defcmd_set)); - kfree(save_defcmd_set); - s = defcmd_set + defcmd_set_count; - memset(s, 0, sizeof(*s)); - s->usable = 1; - s->name = kdb_strdup(argv[1], GFP_KDB); - s->usage = kdb_strdup(argv[2], GFP_KDB); - s->help = kdb_strdup(argv[3], GFP_KDB); - if (s->usage[0] == '"') { - strcpy(s->usage, s->usage+1); - s->usage[strlen(s->usage)-1] = '\0'; - } - if (s->help[0] == '"') { - strcpy(s->help, s->help+1); - s->help[strlen(s->help)-1] = '\0'; - } - ++defcmd_set_count; - defcmd_in_progress = 1; - return 0; -} - -/* - * kdb_exec_defcmd - Execute the set of commands associated with this - * defcmd name. - * Inputs: - * argc argument count - * argv argument vector - * Returns: - * zero for success, a kdb diagnostic if error - */ -static int kdb_exec_defcmd(int argc, const char **argv) -{ - int i, ret; - struct defcmd_set *s; - if (argc != 0) - return KDB_ARGCOUNT; - for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) { - if (strcmp(s->name, argv[0]) == 0) - break; - } - if (i == defcmd_set_count) { - kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", - argv[0]); - return KDB_NOTIMP; - } - for (i = 0; i < s->count; ++i) { - /* Recursive use of kdb_parse, do not use argv after - * this point */ - argv = NULL; - kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]); - ret = kdb_parse(s->command[i]); - if (ret) - return ret; - } - return 0; -} - -/* Command history */ -#define KDB_CMD_HISTORY_COUNT 32 -#define CMD_BUFLEN 200 /* kdb_printf: max printline - * size == 256 */ -static unsigned int cmd_head, cmd_tail; -static unsigned int cmdptr; -static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN]; -static char cmd_cur[CMD_BUFLEN]; - -/* - * The "str" argument may point to something like | grep xyz - */ -static void parse_grep(const char *str) -{ - int len; - char *cp = (char *)str, *cp2; - - /* sanity check: we should have been called with the \ first */ - if (*cp 
!= '|') - return; - cp++; - while (isspace(*cp)) - cp++; - if (strncmp(cp, "grep ", 5)) { - kdb_printf("invalid 'pipe', see grephelp\n"); - return; - } - cp += 5; - while (isspace(*cp)) - cp++; - cp2 = strchr(cp, '\n'); - if (cp2) - *cp2 = '\0'; /* remove the trailing newline */ - len = strlen(cp); - if (len == 0) { - kdb_printf("invalid 'pipe', see grephelp\n"); - return; - } - /* now cp points to a nonzero length search string */ - if (*cp == '"') { - /* allow it be "x y z" by removing the "'s - there must - be two of them */ - cp++; - cp2 = strchr(cp, '"'); - if (!cp2) { - kdb_printf("invalid quoted string, see grephelp\n"); - return; - } - *cp2 = '\0'; /* end the string where the 2nd " was */ - } - kdb_grep_leading = 0; - if (*cp == '^') { - kdb_grep_leading = 1; - cp++; - } - len = strlen(cp); - kdb_grep_trailing = 0; - if (*(cp+len-1) == '$') { - kdb_grep_trailing = 1; - *(cp+len-1) = '\0'; - } - len = strlen(cp); - if (!len) - return; - if (len >= GREP_LEN) { - kdb_printf("search string too long\n"); - return; - } - strcpy(kdb_grep_string, cp); - kdb_grepping_flag++; - return; -} - -/* - * kdb_parse - Parse the command line, search the command table for a - * matching command and invoke the command function. This - * function may be called recursively, if it is, the second call - * will overwrite argv and cbuf. It is the caller's - * responsibility to save their argv if they recursively call - * kdb_parse(). - * Parameters: - * cmdstr The input command line to be parsed. - * regs The registers at the time kdb was entered. - * Returns: - * Zero for success, a kdb diagnostic if failure. - * Remarks: - * Limited to 20 tokens. - * - * Real rudimentary tokenization. Basically only whitespace - * is considered a token delimeter (but special consideration - * is taken of the '=' sign as used by the 'set' command). 
- * - * The algorithm used to tokenize the input string relies on - * there being at least one whitespace (or otherwise useless) - * character between tokens as the character immediately following - * the token is altered in-place to a null-byte to terminate the - * token string. - */ - -#define MAXARGC 20 - -int kdb_parse(const char *cmdstr) -{ - static char *argv[MAXARGC]; - static int argc; - static char cbuf[CMD_BUFLEN+2]; - char *cp; - char *cpp, quoted; - kdbtab_t *tp; - int i, escaped, ignore_errors = 0, check_grep; - - /* - * First tokenize the command string. - */ - cp = (char *)cmdstr; - kdb_grepping_flag = check_grep = 0; - - if (KDB_FLAG(CMD_INTERRUPT)) { - /* Previous command was interrupted, newline must not - * repeat the command */ - KDB_FLAG_CLEAR(CMD_INTERRUPT); - KDB_STATE_SET(PAGER); - argc = 0; /* no repeat */ - } - - if (*cp != '\n' && *cp != '\0') { - argc = 0; - cpp = cbuf; - while (*cp) { - /* skip whitespace */ - while (isspace(*cp)) - cp++; - if ((*cp == '\0') || (*cp == '\n') || - (*cp == '#' && !defcmd_in_progress)) - break; - /* special case: check for | grep pattern */ - if (*cp == '|') { - check_grep++; - break; - } - if (cpp >= cbuf + CMD_BUFLEN) { - kdb_printf("kdb_parse: command buffer " - "overflow, command ignored\n%s\n", - cmdstr); - return KDB_NOTFOUND; - } - if (argc >= MAXARGC - 1) { - kdb_printf("kdb_parse: too many arguments, " - "command ignored\n%s\n", cmdstr); - return KDB_NOTFOUND; - } - argv[argc++] = cpp; - escaped = 0; - quoted = '\0'; - /* Copy to next unquoted and unescaped - * whitespace or '=' */ - while (*cp && *cp != '\n' && - (escaped || quoted || !isspace(*cp))) { - if (cpp >= cbuf + CMD_BUFLEN) - break; - if (escaped) { - escaped = 0; - *cpp++ = *cp++; - continue; - } - if (*cp == '\\') { - escaped = 1; - ++cp; - continue; - } - if (*cp == quoted) - quoted = '\0'; - else if (*cp == '\'' || *cp == '"') - quoted = *cp; - *cpp = *cp++; - if (*cpp == '=' && !quoted) - break; - ++cpp; - } - *cpp++ = '\0'; /* 
Squash a ws or '=' character */ - } - } - if (!argc) - return 0; - if (check_grep) - parse_grep(cp); - if (defcmd_in_progress) { - int result = kdb_defcmd2(cmdstr, argv[0]); - if (!defcmd_in_progress) { - argc = 0; /* avoid repeat on endefcmd */ - *(argv[0]) = '\0'; - } - return result; - } - if (argv[0][0] == '-' && argv[0][1] && - (argv[0][1] < '0' || argv[0][1] > '9')) { - ignore_errors = 1; - ++argv[0]; - } - - for_each_kdbcmd(tp, i) { - if (tp->cmd_name) { - /* - * If this command is allowed to be abbreviated, - * check to see if this is it. - */ - - if (tp->cmd_minlen - && (strlen(argv[0]) <= tp->cmd_minlen)) { - if (strncmp(argv[0], - tp->cmd_name, - tp->cmd_minlen) == 0) { - break; - } - } - - if (strcmp(argv[0], tp->cmd_name) == 0) - break; - } - } - - /* - * If we don't find a command by this name, see if the first - * few characters of this match any of the known commands. - * e.g., md1c20 should match md. - */ - if (i == kdb_max_commands) { - for_each_kdbcmd(tp, i) { - if (tp->cmd_name) { - if (strncmp(argv[0], - tp->cmd_name, - strlen(tp->cmd_name)) == 0) { - break; - } - } - } - } - - if (i < kdb_max_commands) { - int result; - KDB_STATE_SET(CMD); - result = (*tp->cmd_func)(argc-1, (const char **)argv); - if (result && ignore_errors && result > KDB_CMD_GO) - result = 0; - KDB_STATE_CLEAR(CMD); - switch (tp->cmd_repeat) { - case KDB_REPEAT_NONE: - argc = 0; - if (argv[0]) - *(argv[0]) = '\0'; - break; - case KDB_REPEAT_NO_ARGS: - argc = 1; - if (argv[1]) - *(argv[1]) = '\0'; - break; - case KDB_REPEAT_WITH_ARGS: - break; - } - return result; - } - - /* - * If the input with which we were presented does not - * map to an existing command, attempt to parse it as an - * address argument and display the result. Useful for - * obtaining the address of a variable, or the nearest symbol - * to an address contained in a register. 
- */ - { - unsigned long value; - char *name = NULL; - long offset; - int nextarg = 0; - - if (kdbgetaddrarg(0, (const char **)argv, &nextarg, - &value, &offset, &name)) { - return KDB_NOTFOUND; - } - - kdb_printf("%s = ", argv[0]); - kdb_symbol_print(value, NULL, KDB_SP_DEFAULT); - kdb_printf("\n"); - return 0; - } -} - - -static int handle_ctrl_cmd(char *cmd) -{ -#define CTRL_P 16 -#define CTRL_N 14 - - /* initial situation */ - if (cmd_head == cmd_tail) - return 0; - switch (*cmd) { - case CTRL_P: - if (cmdptr != cmd_tail) - cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; - strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); - return 1; - case CTRL_N: - if (cmdptr != cmd_head) - cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; - strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); - return 1; - } - return 0; -} - -/* - * kdb_reboot - This function implements the 'reboot' command. Reboot - * the system immediately, or loop for ever on failure. - */ -static int kdb_reboot(int argc, const char **argv) -{ - emergency_restart(); - kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n"); - while (1) - cpu_relax(); - /* NOTREACHED */ - return 0; -} - -static void kdb_dumpregs(struct pt_regs *regs) -{ - int old_lvl = console_loglevel; - console_loglevel = 15; - kdb_trap_printk++; - show_regs(regs); - kdb_trap_printk--; - kdb_printf("\n"); - console_loglevel = old_lvl; -} - -void kdb_set_current_task(struct task_struct *p) -{ - kdb_current_task = p; - - if (kdb_task_has_cpu(p)) { - kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p)); - return; - } - kdb_current_regs = NULL; -} - -/* - * kdb_local - The main code for kdb. This routine is invoked on a - * specific processor, it is not global. The main kdb() routine - * ensures that only one processor at a time is in this routine. - * This code is called with the real reason code on the first - * entry to a kdb session, thereafter it is called with reason - * SWITCH, even if the user goes back to the original cpu. 
- * Inputs: - * reason The reason KDB was invoked - * error The hardware-defined error code - * regs The exception frame at time of fault/breakpoint. - * db_result Result code from the break or debug point. - * Returns: - * 0 KDB was invoked for an event which it wasn't responsible - * 1 KDB handled the event for which it was invoked. - * KDB_CMD_GO User typed 'go'. - * KDB_CMD_CPU User switched to another cpu. - * KDB_CMD_SS Single step. - * KDB_CMD_SSB Single step until branch. - */ -static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, - kdb_dbtrap_t db_result) -{ - char *cmdbuf; - int diag; - struct task_struct *kdb_current = - kdb_curr_task(raw_smp_processor_id()); - - KDB_DEBUG_STATE("kdb_local 1", reason); - kdb_go_count = 0; - if (reason == KDB_REASON_DEBUG) { - /* special case below */ - } else { - kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", - kdb_current, kdb_current ? kdb_current->pid : 0); -#if defined(CONFIG_SMP) - kdb_printf("on processor %d ", raw_smp_processor_id()); -#endif - } - - switch (reason) { - case KDB_REASON_DEBUG: - { - /* - * If re-entering kdb after a single step - * command, don't print the message. - */ - switch (db_result) { - case KDB_DB_BPT: - kdb_printf("\nEntering kdb (0x%p, pid %d) ", - kdb_current, kdb_current->pid); -#if defined(CONFIG_SMP) - kdb_printf("on processor %d ", raw_smp_processor_id()); -#endif - kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", - instruction_pointer(regs)); - break; - case KDB_DB_SSB: - /* - * In the midst of ssb command. Just return. 
- */ - KDB_DEBUG_STATE("kdb_local 3", reason); - return KDB_CMD_SSB; /* Continue with SSB command */ - - break; - case KDB_DB_SS: - break; - case KDB_DB_SSBPT: - KDB_DEBUG_STATE("kdb_local 4", reason); - return 1; /* kdba_db_trap did the work */ - default: - kdb_printf("kdb: Bad result from kdba_db_trap: %d\n", - db_result); - break; - } - - } - break; - case KDB_REASON_ENTER: - if (KDB_STATE(KEYBOARD)) - kdb_printf("due to Keyboard Entry\n"); - else - kdb_printf("due to KDB_ENTER()\n"); - break; - case KDB_REASON_KEYBOARD: - KDB_STATE_SET(KEYBOARD); - kdb_printf("due to Keyboard Entry\n"); - break; - case KDB_REASON_ENTER_SLAVE: - /* drop through, slaves only get released via cpu switch */ - case KDB_REASON_SWITCH: - kdb_printf("due to cpu switch\n"); - break; - case KDB_REASON_OOPS: - kdb_printf("Oops: %s\n", kdb_diemsg); - kdb_printf("due to oops @ " kdb_machreg_fmt "\n", - instruction_pointer(regs)); - kdb_dumpregs(regs); - break; - case KDB_REASON_NMI: - kdb_printf("due to NonMaskable Interrupt @ " - kdb_machreg_fmt "\n", - instruction_pointer(regs)); - kdb_dumpregs(regs); - break; - case KDB_REASON_SSTEP: - case KDB_REASON_BREAK: - kdb_printf("due to %s @ " kdb_machreg_fmt "\n", - reason == KDB_REASON_BREAK ? - "Breakpoint" : "SS trap", instruction_pointer(regs)); - /* - * Determine if this breakpoint is one that we - * are interested in. - */ - if (db_result != KDB_DB_BPT) { - kdb_printf("kdb: error return from kdba_bp_trap: %d\n", - db_result); - KDB_DEBUG_STATE("kdb_local 6", reason); - return 0; /* Not for us, dismiss it */ - } - break; - case KDB_REASON_RECURSE: - kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n", - instruction_pointer(regs)); - break; - default: - kdb_printf("kdb: unexpected reason code: %d\n", reason); - KDB_DEBUG_STATE("kdb_local 8", reason); - return 0; /* Not for us, dismiss it */ - } - - while (1) { - /* - * Initialize pager context. 
- */ - kdb_nextline = 1; - KDB_STATE_CLEAR(SUPPRESS); - - cmdbuf = cmd_cur; - *cmdbuf = '\0'; - *(cmd_hist[cmd_head]) = '\0'; - - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - -do_full_getstr: -#if defined(CONFIG_SMP) - snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), - raw_smp_processor_id()); -#else - snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); -#endif - if (defcmd_in_progress) - strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); - - /* - * Fetch command from keyboard - */ - cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str); - if (*cmdbuf != '\n') { - if (*cmdbuf < 32) { - if (cmdptr == cmd_head) { - strncpy(cmd_hist[cmd_head], cmd_cur, - CMD_BUFLEN); - *(cmd_hist[cmd_head] + - strlen(cmd_hist[cmd_head])-1) = '\0'; - } - if (!handle_ctrl_cmd(cmdbuf)) - *(cmd_cur+strlen(cmd_cur)-1) = '\0'; - cmdbuf = cmd_cur; - goto do_full_getstr; - } else { - strncpy(cmd_hist[cmd_head], cmd_cur, - CMD_BUFLEN); - } - - cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT; - if (cmd_head == cmd_tail) - cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT; - } - - cmdptr = cmd_head; - diag = kdb_parse(cmdbuf); - if (diag == KDB_NOTFOUND) { - kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); - diag = 0; - } - if (diag == KDB_CMD_GO - || diag == KDB_CMD_CPU - || diag == KDB_CMD_SS - || diag == KDB_CMD_SSB - || diag == KDB_CMD_KGDB) - break; - - if (diag) - kdb_cmderror(diag); - } - KDB_DEBUG_STATE("kdb_local 9", diag); - return diag; -} - - -/* - * kdb_print_state - Print the state data for the current processor - * for debugging. - * Inputs: - * text Identifies the debug point - * value Any integer value to be printed, e.g. reason code. 
- */ -void kdb_print_state(const char *text, int value) -{ - kdb_printf("state: %s cpu %d value %d initial %d state %x\n", - text, raw_smp_processor_id(), value, kdb_initial_cpu, - kdb_state); -} - -/* - * kdb_main_loop - After initial setup and assignment of the - * controlling cpu, all cpus are in this loop. One cpu is in - * control and will issue the kdb prompt, the others will spin - * until 'go' or cpu switch. - * - * To get a consistent view of the kernel stacks for all - * processes, this routine is invoked from the main kdb code via - * an architecture specific routine. kdba_main_loop is - * responsible for making the kernel stacks consistent for all - * processes, there should be no difference between a blocked - * process and a running process as far as kdb is concerned. - * Inputs: - * reason The reason KDB was invoked - * error The hardware-defined error code - * reason2 kdb's current reason code. - * Initially error but can change - * according to kdb state. - * db_result Result code from break or debug point. - * regs The exception frame at time of fault/breakpoint. - * should always be valid. - * Returns: - * 0 KDB was invoked for an event which it wasn't responsible - * 1 KDB handled the event for which it was invoked. - */ -int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, - kdb_dbtrap_t db_result, struct pt_regs *regs) -{ - int result = 1; - /* Stay in kdb() until 'go', 'ss[b]' or an error */ - while (1) { - /* - * All processors except the one that is in control - * will spin here. - */ - KDB_DEBUG_STATE("kdb_main_loop 1", reason); - while (KDB_STATE(HOLD_CPU)) { - /* state KDB is turned off by kdb_cpu to see if the - * other cpus are still live, each cpu in this loop - * turns it back on. 
- */ - if (!KDB_STATE(KDB)) - KDB_STATE_SET(KDB); - } - - KDB_STATE_CLEAR(SUPPRESS); - KDB_DEBUG_STATE("kdb_main_loop 2", reason); - if (KDB_STATE(LEAVING)) - break; /* Another cpu said 'go' */ - /* Still using kdb, this processor is in control */ - result = kdb_local(reason2, error, regs, db_result); - KDB_DEBUG_STATE("kdb_main_loop 3", result); - - if (result == KDB_CMD_CPU) - break; - - if (result == KDB_CMD_SS) { - KDB_STATE_SET(DOING_SS); - break; - } - - if (result == KDB_CMD_SSB) { - KDB_STATE_SET(DOING_SS); - KDB_STATE_SET(DOING_SSB); - break; - } - - if (result == KDB_CMD_KGDB) { - if (!KDB_STATE(DOING_KGDB)) - kdb_printf("Entering please attach debugger " - "or use $D#44+ or $3#33\n"); - break; - } - if (result && result != 1 && result != KDB_CMD_GO) - kdb_printf("\nUnexpected kdb_local return code %d\n", - result); - KDB_DEBUG_STATE("kdb_main_loop 4", reason); - break; - } - if (KDB_STATE(DOING_SS)) - KDB_STATE_CLEAR(SSBPT); - - return result; -} - -/* - * kdb_mdr - This function implements the guts of the 'mdr', memory - * read command. - * mdr , - * Inputs: - * addr Start address - * count Number of bytes - * Returns: - * Always 0. Any errors are detected and printed by kdb_getarea. - */ -static int kdb_mdr(unsigned long addr, unsigned int count) -{ - unsigned char c; - while (count--) { - if (kdb_getarea(c, addr)) - return 0; - kdb_printf("%02x", c); - addr++; - } - kdb_printf("\n"); - return 0; -} - -/* - * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4', - * 'md8' 'mdr' and 'mds' commands. - * - * md|mds [ [ []]] - * mdWcN [ [ []]] - * where W = is the width (1, 2, 4 or 8) and N is the count. - * for eg., md1c20 reads 20 bytes, 1 at a time. 
- * mdr , - */ -static void kdb_md_line(const char *fmtstr, unsigned long addr, - int symbolic, int nosect, int bytesperword, - int num, int repeat, int phys) -{ - /* print just one line of data */ - kdb_symtab_t symtab; - char cbuf[32]; - char *c = cbuf; - int i; - unsigned long word; - - memset(cbuf, '\0', sizeof(cbuf)); - if (phys) - kdb_printf("phys " kdb_machreg_fmt0 " ", addr); - else - kdb_printf(kdb_machreg_fmt0 " ", addr); - - for (i = 0; i < num && repeat--; i++) { - if (phys) { - if (kdb_getphysword(&word, addr, bytesperword)) - break; - } else if (kdb_getword(&word, addr, bytesperword)) - break; - kdb_printf(fmtstr, word); - if (symbolic) - kdbnearsym(word, &symtab); - else - memset(&symtab, 0, sizeof(symtab)); - if (symtab.sym_name) { - kdb_symbol_print(word, &symtab, 0); - if (!nosect) { - kdb_printf("\n"); - kdb_printf(" %s %s " - kdb_machreg_fmt " " - kdb_machreg_fmt " " - kdb_machreg_fmt, symtab.mod_name, - symtab.sec_name, symtab.sec_start, - symtab.sym_start, symtab.sym_end); - } - addr += bytesperword; - } else { - union { - u64 word; - unsigned char c[8]; - } wc; - unsigned char *cp; -#ifdef __BIG_ENDIAN - cp = wc.c + 8 - bytesperword; -#else - cp = wc.c; -#endif - wc.word = word; -#define printable_char(c) \ - ({unsigned char __c = c; isascii(__c) && isprint(__c) ? 
__c : '.'; }) - switch (bytesperword) { - case 8: - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 4; - case 4: - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 2; - case 2: - *c++ = printable_char(*cp++); - addr++; - case 1: - *c++ = printable_char(*cp++); - addr++; - break; - } -#undef printable_char - } - } - kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1), - " ", cbuf); -} - -static int kdb_md(int argc, const char **argv) -{ - static unsigned long last_addr; - static int last_radix, last_bytesperword, last_repeat; - int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat; - int nosect = 0; - char fmtchar, fmtstr[64]; - unsigned long addr; - unsigned long word; - long offset = 0; - int symbolic = 0; - int valid = 0; - int phys = 0; - - kdbgetintenv("MDCOUNT", &mdcount); - kdbgetintenv("RADIX", &radix); - kdbgetintenv("BYTESPERWORD", &bytesperword); - - /* Assume 'md ' and start with environment values */ - repeat = mdcount * 16 / bytesperword; - - if (strcmp(argv[0], "mdr") == 0) { - if (argc != 2) - return KDB_ARGCOUNT; - valid = 1; - } else if (isdigit(argv[0][2])) { - bytesperword = (int)(argv[0][2] - '0'); - if (bytesperword == 0) { - bytesperword = last_bytesperword; - if (bytesperword == 0) - bytesperword = 4; - } - last_bytesperword = bytesperword; - repeat = mdcount * 16 / bytesperword; - if (!argv[0][3]) - valid = 1; - else if (argv[0][3] == 'c' && argv[0][4]) { - char *p; - repeat = simple_strtoul(argv[0] + 4, &p, 10); - mdcount = ((repeat * bytesperword) + 15) / 16; - valid = !*p; - } - last_repeat = repeat; - } else if (strcmp(argv[0], "md") == 0) - valid = 1; - else if (strcmp(argv[0], "mds") == 0) - valid = 1; - else if (strcmp(argv[0], "mdp") == 0) { - phys = valid = 1; - } - if (!valid) - return KDB_NOTFOUND; - - if (argc == 0) { - if (last_addr == 0) - return KDB_ARGCOUNT; - addr = last_addr; - radix = 
last_radix; - bytesperword = last_bytesperword; - repeat = last_repeat; - mdcount = ((repeat * bytesperword) + 15) / 16; - } - - if (argc) { - unsigned long val; - int diag, nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, - &offset, NULL); - if (diag) - return diag; - if (argc > nextarg+2) - return KDB_ARGCOUNT; - - if (argc >= nextarg) { - diag = kdbgetularg(argv[nextarg], &val); - if (!diag) { - mdcount = (int) val; - repeat = mdcount * 16 / bytesperword; - } - } - if (argc >= nextarg+1) { - diag = kdbgetularg(argv[nextarg+1], &val); - if (!diag) - radix = (int) val; - } - } - - if (strcmp(argv[0], "mdr") == 0) - return kdb_mdr(addr, mdcount); - - switch (radix) { - case 10: - fmtchar = 'd'; - break; - case 16: - fmtchar = 'x'; - break; - case 8: - fmtchar = 'o'; - break; - default: - return KDB_BADRADIX; - } - - last_radix = radix; - - if (bytesperword > KDB_WORD_SIZE) - return KDB_BADWIDTH; - - switch (bytesperword) { - case 8: - sprintf(fmtstr, "%%16.16l%c ", fmtchar); - break; - case 4: - sprintf(fmtstr, "%%8.8l%c ", fmtchar); - break; - case 2: - sprintf(fmtstr, "%%4.4l%c ", fmtchar); - break; - case 1: - sprintf(fmtstr, "%%2.2l%c ", fmtchar); - break; - default: - return KDB_BADWIDTH; - } - - last_repeat = repeat; - last_bytesperword = bytesperword; - - if (strcmp(argv[0], "mds") == 0) { - symbolic = 1; - /* Do not save these changes as last_*, they are temporary mds - * overrides. - */ - bytesperword = KDB_WORD_SIZE; - repeat = mdcount; - kdbgetintenv("NOSECT", &nosect); - } - - /* Round address down modulo BYTESPERWORD */ - - addr &= ~(bytesperword-1); - - while (repeat > 0) { - unsigned long a; - int n, z, num = (symbolic ? 
1 : (16 / bytesperword)); - - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) { - if (phys) { - if (kdb_getphysword(&word, a, bytesperword) - || word) - break; - } else if (kdb_getword(&word, a, bytesperword) || word) - break; - } - n = min(num, repeat); - kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword, - num, repeat, phys); - addr += bytesperword * n; - repeat -= n; - z = (z + num - 1) / num; - if (z > 2) { - int s = num * (z-2); - kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0 - " zero suppressed\n", - addr, addr + bytesperword * s - 1); - addr += bytesperword * s; - repeat -= s; - } - } - last_addr = addr; - - return 0; -} - -/* - * kdb_mm - This function implements the 'mm' command. - * mm address-expression new-value - * Remarks: - * mm works on machine words, mmW works on bytes. - */ -static int kdb_mm(int argc, const char **argv) -{ - int diag; - unsigned long addr; - long offset = 0; - unsigned long contents; - int nextarg; - int width; - - if (argv[0][2] && !isdigit(argv[0][2])) - return KDB_NOTFOUND; - - if (argc < 2) - return KDB_ARGCOUNT; - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); - if (diag) - return diag; - - if (nextarg > argc) - return KDB_ARGCOUNT; - diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL); - if (diag) - return diag; - - if (nextarg != argc + 1) - return KDB_ARGCOUNT; - - width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE); - diag = kdb_putword(addr, contents, width); - if (diag) - return diag; - - kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents); - - return 0; -} - -/* - * kdb_go - This function implements the 'go' command. 
- * go [address-expression] - */ -static int kdb_go(int argc, const char **argv) -{ - unsigned long addr; - int diag; - int nextarg; - long offset; - - if (raw_smp_processor_id() != kdb_initial_cpu) { - kdb_printf("go must execute on the entry cpu, " - "please use \"cpu %d\" and then execute go\n", - kdb_initial_cpu); - return KDB_BADCPUNUM; - } - if (argc == 1) { - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, - &addr, &offset, NULL); - if (diag) - return diag; - } else if (argc) { - return KDB_ARGCOUNT; - } - - diag = KDB_CMD_GO; - if (KDB_FLAG(CATASTROPHIC)) { - kdb_printf("Catastrophic error detected\n"); - kdb_printf("kdb_continue_catastrophic=%d, ", - kdb_continue_catastrophic); - if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) { - kdb_printf("type go a second time if you really want " - "to continue\n"); - return 0; - } - if (kdb_continue_catastrophic == 2) { - kdb_printf("forcing reboot\n"); - kdb_reboot(0, NULL); - } - kdb_printf("attempting to continue\n"); - } - return diag; -} - -/* - * kdb_rd - This function implements the 'rd' command. 
- */ -static int kdb_rd(int argc, const char **argv) -{ - int len = kdb_check_regs(); -#if DBG_MAX_REG_NUM > 0 - int i; - char *rname; - int rsize; - u64 reg64; - u32 reg32; - u16 reg16; - u8 reg8; - - if (len) - return len; - - for (i = 0; i < DBG_MAX_REG_NUM; i++) { - rsize = dbg_reg_def[i].size * 2; - if (rsize > 16) - rsize = 2; - if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { - len = 0; - kdb_printf("\n"); - } - if (len) - len += kdb_printf(" "); - switch(dbg_reg_def[i].size * 8) { - case 8: - rname = dbg_get_reg(i, ®8, kdb_current_regs); - if (!rname) - break; - len += kdb_printf("%s: %02x", rname, reg8); - break; - case 16: - rname = dbg_get_reg(i, ®16, kdb_current_regs); - if (!rname) - break; - len += kdb_printf("%s: %04x", rname, reg16); - break; - case 32: - rname = dbg_get_reg(i, ®32, kdb_current_regs); - if (!rname) - break; - len += kdb_printf("%s: %08x", rname, reg32); - break; - case 64: - rname = dbg_get_reg(i, ®64, kdb_current_regs); - if (!rname) - break; - len += kdb_printf("%s: %016llx", rname, reg64); - break; - default: - len += kdb_printf("%s: ??", dbg_reg_def[i].name); - } - } - kdb_printf("\n"); -#else - if (len) - return len; - - kdb_dumpregs(kdb_current_regs); -#endif - return 0; -} - -/* - * kdb_rm - This function implements the 'rm' (register modify) command. - * rm register-name new-contents - * Remarks: - * Allows register modification with the same restrictions as gdb - */ -static int kdb_rm(int argc, const char **argv) -{ -#if DBG_MAX_REG_NUM > 0 - int diag; - const char *rname; - int i; - u64 reg64; - u32 reg32; - u16 reg16; - u8 reg8; - - if (argc != 2) - return KDB_ARGCOUNT; - /* - * Allow presence or absence of leading '%' symbol. 
- */ - rname = argv[1]; - if (*rname == '%') - rname++; - - diag = kdbgetu64arg(argv[2], ®64); - if (diag) - return diag; - - diag = kdb_check_regs(); - if (diag) - return diag; - - diag = KDB_BADREG; - for (i = 0; i < DBG_MAX_REG_NUM; i++) { - if (strcmp(rname, dbg_reg_def[i].name) == 0) { - diag = 0; - break; - } - } - if (!diag) { - switch(dbg_reg_def[i].size * 8) { - case 8: - reg8 = reg64; - dbg_set_reg(i, ®8, kdb_current_regs); - break; - case 16: - reg16 = reg64; - dbg_set_reg(i, ®16, kdb_current_regs); - break; - case 32: - reg32 = reg64; - dbg_set_reg(i, ®32, kdb_current_regs); - break; - case 64: - dbg_set_reg(i, ®64, kdb_current_regs); - break; - } - } - return diag; -#else - kdb_printf("ERROR: Register set currently not implemented\n"); - return 0; -#endif -} - -#if defined(CONFIG_MAGIC_SYSRQ) -/* - * kdb_sr - This function implements the 'sr' (SYSRQ key) command - * which interfaces to the soi-disant MAGIC SYSRQ functionality. - * sr - */ -static int kdb_sr(int argc, const char **argv) -{ - if (argc != 1) - return KDB_ARGCOUNT; - kdb_trap_printk++; - __handle_sysrq(*argv[1], false); - kdb_trap_printk--; - - return 0; -} -#endif /* CONFIG_MAGIC_SYSRQ */ - -/* - * kdb_ef - This function implements the 'regs' (display exception - * frame) command. This command takes an address and expects to - * find an exception frame at that address, formats and prints - * it. - * regs address-expression - * Remarks: - * Not done yet. - */ -static int kdb_ef(int argc, const char **argv) -{ - int diag; - unsigned long addr; - long offset; - int nextarg; - - if (argc != 1) - return KDB_ARGCOUNT; - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); - if (diag) - return diag; - show_regs((struct pt_regs *)addr); - return 0; -} - -#if defined(CONFIG_MODULES) -/* - * kdb_lsmod - This function implements the 'lsmod' command. Lists - * currently loaded kernel modules. - * Mostly taken from userland lsmod. 
- */ -static int kdb_lsmod(int argc, const char **argv) -{ - struct module *mod; - - if (argc != 0) - return KDB_ARGCOUNT; - - kdb_printf("Module Size modstruct Used by\n"); - list_for_each_entry(mod, kdb_modules, list) { - - kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); -#ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); -#endif - if (mod->state == MODULE_STATE_GOING) - kdb_printf(" (Unloading)"); - else if (mod->state == MODULE_STATE_COMING) - kdb_printf(" (Loading)"); - else - kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); - -#ifdef CONFIG_MODULE_UNLOAD - { - struct module_use *use; - kdb_printf(" [ "); - list_for_each_entry(use, &mod->source_list, - source_list) - kdb_printf("%s ", use->target->name); - kdb_printf("]\n"); - } -#endif - } - - return 0; -} - -#endif /* CONFIG_MODULES */ - -/* - * kdb_env - This function implements the 'env' command. Display the - * current environment variables. - */ - -static int kdb_env(int argc, const char **argv) -{ - int i; - - for (i = 0; i < __nenv; i++) { - if (__env[i]) - kdb_printf("%s\n", __env[i]); - } - - if (KDB_DEBUG(MASK)) - kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); - - return 0; -} - -#ifdef CONFIG_PRINTK -/* - * kdb_dmesg - This function implements the 'dmesg' command to display - * the contents of the syslog buffer. 
- * dmesg [lines] [adjust] - */ -static int kdb_dmesg(int argc, const char **argv) -{ - char *syslog_data[4], *start, *end, c = '\0', *p; - int diag, logging, logsize, lines = 0, adjust = 0, n; - - if (argc > 2) - return KDB_ARGCOUNT; - if (argc) { - char *cp; - lines = simple_strtol(argv[1], &cp, 0); - if (*cp) - lines = 0; - if (argc > 1) { - adjust = simple_strtoul(argv[2], &cp, 0); - if (*cp || adjust < 0) - adjust = 0; - } - } - - /* disable LOGGING if set */ - diag = kdbgetintenv("LOGGING", &logging); - if (!diag && logging) { - const char *setargs[] = { "set", "LOGGING", "0" }; - kdb_set(2, setargs); - } - - /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] - * logical start, end+1. */ - kdb_syslog_data(syslog_data); - if (syslog_data[2] == syslog_data[3]) - return 0; - logsize = syslog_data[1] - syslog_data[0]; - start = syslog_data[2]; - end = syslog_data[3]; -#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) - for (n = 0, p = start; p < end; ++p) { - c = *KDB_WRAP(p); - if (c == '\n') - ++n; - } - if (c != '\n') - ++n; - if (lines < 0) { - if (adjust >= n) - kdb_printf("buffer only contains %d lines, nothing " - "printed\n", n); - else if (adjust - lines >= n) - kdb_printf("buffer only contains %d lines, last %d " - "lines printed\n", n, n - adjust); - if (adjust) { - for (; start < end && adjust; ++start) { - if (*KDB_WRAP(start) == '\n') - --adjust; - } - if (start < end) - ++start; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - ++lines; - } - end = p; - } else if (lines > 0) { - int skip = n - (adjust + lines); - if (adjust >= n) { - kdb_printf("buffer only contains %d lines, " - "nothing printed\n", n); - skip = n; - } else if (skip < 0) { - lines += skip; - skip = 0; - kdb_printf("buffer only contains %d lines, first " - "%d lines printed\n", n, lines); - } - for (; start < end && skip; ++start) { - if (*KDB_WRAP(start) == '\n') - --skip; - } - for (p = start; p < end && lines; ++p) { - if 
(*KDB_WRAP(p) == '\n') - --lines; - } - end = p; - } - /* Do a line at a time (max 200 chars) to reduce protocol overhead */ - c = '\n'; - while (start != end) { - char buf[201]; - p = buf; - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - while (start < end && (c = *KDB_WRAP(start)) && - (p - buf) < sizeof(buf)-1) { - ++start; - *p++ = c; - if (c == '\n') - break; - } - *p = '\0'; - kdb_printf("%s", buf); - } - if (c != '\n') - kdb_printf("\n"); - - return 0; -} -#endif /* CONFIG_PRINTK */ -/* - * kdb_cpu - This function implements the 'cpu' command. - * cpu [] - * Returns: - * KDB_CMD_CPU for success, a kdb diagnostic if error - */ -static void kdb_cpu_status(void) -{ - int i, start_cpu, first_print = 1; - char state, prev_state = '?'; - - kdb_printf("Currently on cpu %d\n", raw_smp_processor_id()); - kdb_printf("Available cpus: "); - for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) { - state = 'F'; /* cpu is offline */ - } else { - state = ' '; /* cpu is responding to kdb */ - if (kdb_task_state_char(KDB_TSK(i)) == 'I') - state = 'I'; /* idle task */ - } - if (state != prev_state) { - if (prev_state != '?') { - if (!first_print) - kdb_printf(", "); - first_print = 0; - kdb_printf("%d", start_cpu); - if (start_cpu < i-1) - kdb_printf("-%d", i-1); - if (prev_state != ' ') - kdb_printf("(%c)", prev_state); - } - prev_state = state; - start_cpu = i; - } - } - /* print the trailing cpus, ignoring them if they are all offline */ - if (prev_state != 'F') { - if (!first_print) - kdb_printf(", "); - kdb_printf("%d", start_cpu); - if (start_cpu < i-1) - kdb_printf("-%d", i-1); - if (prev_state != ' ') - kdb_printf("(%c)", prev_state); - } - kdb_printf("\n"); -} - -static int kdb_cpu(int argc, const char **argv) -{ - unsigned long cpunum; - int diag; - - if (argc == 0) { - kdb_cpu_status(); - return 0; - } - - if (argc != 1) - return KDB_ARGCOUNT; - - diag = kdbgetularg(argv[1], &cpunum); - if (diag) - return diag; - - /* - * Validate cpunum - */ - if 
((cpunum > NR_CPUS) || !cpu_online(cpunum)) - return KDB_BADCPUNUM; - - dbg_switch_cpu = cpunum; - - /* - * Switch to other cpu - */ - return KDB_CMD_CPU; -} - -/* The user may not realize that ps/bta with no parameters does not print idle - * or sleeping system daemon processes, so tell them how many were suppressed. - */ -void kdb_ps_suppressed(void) -{ - int idle = 0, daemon = 0; - unsigned long mask_I = kdb_task_state_string("I"), - mask_M = kdb_task_state_string("M"); - unsigned long cpu; - const struct task_struct *p, *g; - for_each_online_cpu(cpu) { - p = kdb_curr_task(cpu); - if (kdb_task_state(p, mask_I)) - ++idle; - } - kdb_do_each_thread(g, p) { - if (kdb_task_state(p, mask_M)) - ++daemon; - } kdb_while_each_thread(g, p); - if (idle || daemon) { - if (idle) - kdb_printf("%d idle process%s (state I)%s\n", - idle, idle == 1 ? "" : "es", - daemon ? " and " : ""); - if (daemon) - kdb_printf("%d sleeping system daemon (state M) " - "process%s", daemon, - daemon == 1 ? "" : "es"); - kdb_printf(" suppressed,\nuse 'ps A' to see all.\n"); - } -} - -/* - * kdb_ps - This function implements the 'ps' command which shows a - * list of the active processes. - * ps [DRSTCZEUIMA] All processes, optionally filtered by state - */ -void kdb_ps1(const struct task_struct *p) -{ - int cpu; - unsigned long tmp; - - if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) - return; - - cpu = kdb_process_cpu(p); - kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", - (void *)p, p->pid, p->parent->pid, - kdb_task_has_cpu(p), kdb_process_cpu(p), - kdb_task_state_char(p), - (void *)(&p->thread), - p == kdb_curr_task(raw_smp_processor_id()) ? 
'*' : ' ', - p->comm); - if (kdb_task_has_cpu(p)) { - if (!KDB_TSK(cpu)) { - kdb_printf(" Error: no saved data for this cpu\n"); - } else { - if (KDB_TSK(cpu) != p) - kdb_printf(" Error: does not match running " - "process table (0x%p)\n", KDB_TSK(cpu)); - } - } -} - -static int kdb_ps(int argc, const char **argv) -{ - struct task_struct *g, *p; - unsigned long mask, cpu; - - if (argc == 0) - kdb_ps_suppressed(); - kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n", - (int)(2*sizeof(void *))+2, "Task Addr", - (int)(2*sizeof(void *))+2, "Thread"); - mask = kdb_task_state_string(argc ? argv[1] : NULL); - /* Run the active tasks first */ - for_each_online_cpu(cpu) { - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - p = kdb_curr_task(cpu); - if (kdb_task_state(p, mask)) - kdb_ps1(p); - } - kdb_printf("\n"); - /* Now the real tasks */ - kdb_do_each_thread(g, p) { - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - if (kdb_task_state(p, mask)) - kdb_ps1(p); - } kdb_while_each_thread(g, p); - - return 0; -} - -/* - * kdb_pid - This function implements the 'pid' command which switches - * the currently active process. - * pid [ | R] - */ -static int kdb_pid(int argc, const char **argv) -{ - struct task_struct *p; - unsigned long val; - int diag; - - if (argc > 1) - return KDB_ARGCOUNT; - - if (argc) { - if (strcmp(argv[1], "R") == 0) { - p = KDB_TSK(kdb_initial_cpu); - } else { - diag = kdbgetularg(argv[1], &val); - if (diag) - return KDB_BADINT; - - p = find_task_by_pid_ns((pid_t)val, &init_pid_ns); - if (!p) { - kdb_printf("No task with pid=%d\n", (pid_t)val); - return 0; - } - } - kdb_set_current_task(p); - } - kdb_printf("KDB current process is %s(pid=%d)\n", - kdb_current_task->comm, - kdb_current_task->pid); - - return 0; -} - -/* - * kdb_ll - This function implements the 'll' command which follows a - * linked list and executes an arbitrary command for each - * element. 
- */ -static int kdb_ll(int argc, const char **argv) -{ - int diag = 0; - unsigned long addr; - long offset = 0; - unsigned long va; - unsigned long linkoffset; - int nextarg; - const char *command; - - if (argc != 3) - return KDB_ARGCOUNT; - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); - if (diag) - return diag; - - diag = kdbgetularg(argv[2], &linkoffset); - if (diag) - return diag; - - /* - * Using the starting address as - * the first element in the list, and assuming that - * the list ends with a null pointer. - */ - - va = addr; - command = kdb_strdup(argv[3], GFP_KDB); - if (!command) { - kdb_printf("%s: cannot duplicate command\n", __func__); - return 0; - } - /* Recursive use of kdb_parse, do not use argv after this point */ - argv = NULL; - - while (va) { - char buf[80]; - - if (KDB_FLAG(CMD_INTERRUPT)) - goto out; - - sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); - diag = kdb_parse(buf); - if (diag) - goto out; - - addr = va + linkoffset; - if (kdb_getword(&va, addr, sizeof(va))) - goto out; - } - -out: - kfree(command); - return diag; -} - -static int kdb_kgdb(int argc, const char **argv) -{ - return KDB_CMD_KGDB; -} - -/* - * kdb_help - This function implements the 'help' and '?' commands. - */ -static int kdb_help(int argc, const char **argv) -{ - kdbtab_t *kt; - int i; - - kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description"); - kdb_printf("-----------------------------" - "-----------------------------\n"); - for_each_kdbcmd(kt, i) { - if (kt->cmd_name) - kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, - kt->cmd_usage, kt->cmd_help); - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - } - return 0; -} - -/* - * kdb_kill - This function implements the 'kill' commands. 
- */ -static int kdb_kill(int argc, const char **argv) -{ - long sig, pid; - char *endp; - struct task_struct *p; - struct siginfo info; - - if (argc != 2) - return KDB_ARGCOUNT; - - sig = simple_strtol(argv[1], &endp, 0); - if (*endp) - return KDB_BADINT; - if (sig >= 0) { - kdb_printf("Invalid signal parameter.<-signal>\n"); - return 0; - } - sig = -sig; - - pid = simple_strtol(argv[2], &endp, 0); - if (*endp) - return KDB_BADINT; - if (pid <= 0) { - kdb_printf("Process ID must be large than 0.\n"); - return 0; - } - - /* Find the process. */ - p = find_task_by_pid_ns(pid, &init_pid_ns); - if (!p) { - kdb_printf("The specified process isn't found.\n"); - return 0; - } - p = p->group_leader; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = pid; /* same capabilities as process being signalled */ - info.si_uid = 0; /* kdb has root authority */ - kdb_send_sig_info(p, &info); - return 0; -} - -struct kdb_tm { - int tm_sec; /* seconds */ - int tm_min; /* minutes */ - int tm_hour; /* hours */ - int tm_mday; /* day of the month */ - int tm_mon; /* month */ - int tm_year; /* year */ -}; - -static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) -{ - /* This will work from 1970-2099, 2100 is not a leap year */ - static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, - 31, 30, 31, 30, 31 }; - memset(tm, 0, sizeof(*tm)); - tm->tm_sec = tv->tv_sec % (24 * 60 * 60); - tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + - (2 * 365 + 1); /* shift base from 1970 to 1968 */ - tm->tm_min = tm->tm_sec / 60 % 60; - tm->tm_hour = tm->tm_sec / 60 / 60; - tm->tm_sec = tm->tm_sec % 60; - tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); - tm->tm_mday %= (4*365+1); - mon_day[1] = 29; - while (tm->tm_mday >= mon_day[tm->tm_mon]) { - tm->tm_mday -= mon_day[tm->tm_mon]; - if (++tm->tm_mon == 12) { - tm->tm_mon = 0; - ++tm->tm_year; - mon_day[1] = 28; - } - } - ++tm->tm_mday; -} - -/* - * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). 
- * I cannot call that code directly from kdb, it has an unconditional - * cli()/sti() and calls routines that take locks which can stop the debugger. - */ -static void kdb_sysinfo(struct sysinfo *val) -{ - struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); - memset(val, 0, sizeof(*val)); - val->uptime = uptime.tv_sec; - val->loads[0] = avenrun[0]; - val->loads[1] = avenrun[1]; - val->loads[2] = avenrun[2]; - val->procs = nr_threads-1; - si_meminfo(val); - - return; -} - -/* - * kdb_summary - This function implements the 'summary' command. - */ -static int kdb_summary(int argc, const char **argv) -{ - struct timespec now; - struct kdb_tm tm; - struct sysinfo val; - - if (argc) - return KDB_ARGCOUNT; - - kdb_printf("sysname %s\n", init_uts_ns.name.sysname); - kdb_printf("release %s\n", init_uts_ns.name.release); - kdb_printf("version %s\n", init_uts_ns.name.version); - kdb_printf("machine %s\n", init_uts_ns.name.machine); - kdb_printf("nodename %s\n", init_uts_ns.name.nodename); - kdb_printf("domainname %s\n", init_uts_ns.name.domainname); - kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - - now = __current_kernel_time(); - kdb_gmtime(&now, &tm); - kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " - "tz_minuteswest %d\n", - 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec, - sys_tz.tz_minuteswest); - - kdb_sysinfo(&val); - kdb_printf("uptime "); - if (val.uptime > (24*60*60)) { - int days = val.uptime / (24*60*60); - val.uptime %= (24*60*60); - kdb_printf("%d day%s ", days, days == 1 ? 
"" : "s"); - } - kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", - LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), - LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), - LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC - /* Display in kilobytes */ -#define K(x) ((x) << (PAGE_SHIFT - 10)) - kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" - "Buffers: %8lu kB\n", - val.totalram, val.freeram, val.bufferram); - return 0; -} - -/* - * kdb_per_cpu - This function implements the 'per_cpu' command. - */ -static int kdb_per_cpu(int argc, const char **argv) -{ - char fmtstr[64]; - int cpu, diag, nextarg = 1; - unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; - - if (argc < 1 || argc > 3) - return KDB_ARGCOUNT; - - diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); - if (diag) - return diag; - - if (argc >= 2) { - diag = kdbgetularg(argv[2], &bytesperword); - if (diag) - return diag; - } - if (!bytesperword) - bytesperword = KDB_WORD_SIZE; - else if (bytesperword > KDB_WORD_SIZE) - return KDB_BADWIDTH; - sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword)); - if (argc >= 3) { - diag = kdbgetularg(argv[3], &whichcpu); - if (diag) - return diag; - if (!cpu_online(whichcpu)) { - kdb_printf("cpu %ld is not online\n", whichcpu); - return KDB_BADCPUNUM; - } - } - - /* Most architectures use __per_cpu_offset[cpu], some use - * __per_cpu_offset(cpu), smp has no __per_cpu_offset. 
- */ -#ifdef __per_cpu_offset -#define KDB_PCU(cpu) __per_cpu_offset(cpu) -#else -#ifdef CONFIG_SMP -#define KDB_PCU(cpu) __per_cpu_offset[cpu] -#else -#define KDB_PCU(cpu) 0 -#endif -#endif - for_each_online_cpu(cpu) { - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - - if (whichcpu != ~0UL && whichcpu != cpu) - continue; - addr = symaddr + KDB_PCU(cpu); - diag = kdb_getword(&val, addr, bytesperword); - if (diag) { - kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " - "read, diag=%d\n", cpu, addr, diag); - continue; - } - kdb_printf("%5d ", cpu); - kdb_md_line(fmtstr, addr, - bytesperword == KDB_WORD_SIZE, - 1, bytesperword, 1, 1, 0); - } -#undef KDB_PCU - return 0; -} - -/* - * display help for the use of cmd | grep pattern - */ -static int kdb_grep_help(int argc, const char **argv) -{ - kdb_printf("Usage of cmd args | grep pattern:\n"); - kdb_printf(" Any command's output may be filtered through an "); - kdb_printf("emulated 'pipe'.\n"); - kdb_printf(" 'grep' is just a key word.\n"); - kdb_printf(" The pattern may include a very limited set of " - "metacharacters:\n"); - kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n"); - kdb_printf(" And if there are spaces in the pattern, you may " - "quote it:\n"); - kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\"" - " or \"^pat tern$\"\n"); - return 0; -} - -/* - * kdb_register_repeat - This function is used to register a kernel - * debugger command. - * Inputs: - * cmd Command name - * func Function to execute the command - * usage A simple usage string showing arguments - * help A simple help string describing command - * repeat Does the command auto repeat on enter? - * Returns: - * zero for success, one if a duplicate command. 
- */ -#define kdb_command_extend 50 /* arbitrary */ -int kdb_register_repeat(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_repeat_t repeat) -{ - int i; - kdbtab_t *kp; - - /* - * Brute force method to determine duplicates - */ - for_each_kdbcmd(kp, i) { - if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { - kdb_printf("Duplicate kdb command registered: " - "%s, func %p help %s\n", cmd, func, help); - return 1; - } - } - - /* - * Insert command into first available location in table - */ - for_each_kdbcmd(kp, i) { - if (kp->cmd_name == NULL) - break; - } - - if (i >= kdb_max_commands) { - kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + - kdb_command_extend) * sizeof(*new), GFP_KDB); - if (!new) { - kdb_printf("Could not allocate new kdb_command " - "table\n"); - return 1; - } - if (kdb_commands) { - memcpy(new, kdb_commands, - (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); - kfree(kdb_commands); - } - memset(new + kdb_max_commands, 0, - kdb_command_extend * sizeof(*new)); - kdb_commands = new; - kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; - kdb_max_commands += kdb_command_extend; - } - - kp->cmd_name = cmd; - kp->cmd_func = func; - kp->cmd_usage = usage; - kp->cmd_help = help; - kp->cmd_flags = 0; - kp->cmd_minlen = minlen; - kp->cmd_repeat = repeat; - - return 0; -} -EXPORT_SYMBOL_GPL(kdb_register_repeat); - - -/* - * kdb_register - Compatibility register function for commands that do - * not need to specify a repeat state. Equivalent to - * kdb_register_repeat with KDB_REPEAT_NONE. - * Inputs: - * cmd Command name - * func Function to execute the command - * usage A simple usage string showing arguments - * help A simple help string describing command - * Returns: - * zero for success, one if a duplicate command. 
- */ -int kdb_register(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen) -{ - return kdb_register_repeat(cmd, func, usage, help, minlen, - KDB_REPEAT_NONE); -} -EXPORT_SYMBOL_GPL(kdb_register); - -/* - * kdb_unregister - This function is used to unregister a kernel - * debugger command. It is generally called when a module which - * implements kdb commands is unloaded. - * Inputs: - * cmd Command name - * Returns: - * zero for success, one command not registered. - */ -int kdb_unregister(char *cmd) -{ - int i; - kdbtab_t *kp; - - /* - * find the command. - */ - for_each_kdbcmd(kp, i) { - if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { - kp->cmd_name = NULL; - return 0; - } - } - - /* Couldn't find it. */ - return 1; -} -EXPORT_SYMBOL_GPL(kdb_unregister); - -/* Initialize the kdb command table. */ -static void __init kdb_inittab(void) -{ - int i; - kdbtab_t *kp; - - for_each_kdbcmd(kp, i) - kp->cmd_name = NULL; - - kdb_register_repeat("md", kdb_md, "", - "Display Memory Contents, also mdWcN, e.g. 
md8c1", 1, - KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdr", kdb_md, " ", - "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mdp", kdb_md, " ", - "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mds", kdb_md, "", - "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("mm", kdb_mm, " ", - "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("go", kdb_go, "[]", - "Continue Execution", 1, KDB_REPEAT_NONE); - kdb_register_repeat("rd", kdb_rd, "", - "Display Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("rm", kdb_rm, " ", - "Modify Registers", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ef", kdb_ef, "", - "Display exception frame", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bt", kdb_bt, "[]", - "Stack traceback", 1, KDB_REPEAT_NONE); - kdb_register_repeat("btp", kdb_bt, "", - "Display stack for process ", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", - "Display stack all processes", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btc", kdb_bt, "", - "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("btt", kdb_bt, "", - "Backtrace process given its struct task address", 0, - KDB_REPEAT_NONE); - kdb_register_repeat("ll", kdb_ll, " ", - "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); - kdb_register_repeat("env", kdb_env, "", - "Show environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("set", kdb_set, "", - "Set environment variables", 0, KDB_REPEAT_NONE); - kdb_register_repeat("help", kdb_help, "", - "Display Help Message", 1, KDB_REPEAT_NONE); - kdb_register_repeat("?", kdb_help, "", - "Display Help Message", 0, KDB_REPEAT_NONE); - kdb_register_repeat("cpu", kdb_cpu, "", - "Switch to new cpu", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kgdb", kdb_kgdb, "", - "Enter kgdb mode", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ps", kdb_ps, "[|A]", - "Display active task list", 0, 
KDB_REPEAT_NONE); - kdb_register_repeat("pid", kdb_pid, "", - "Switch to another task", 0, KDB_REPEAT_NONE); - kdb_register_repeat("reboot", kdb_reboot, "", - "Reboot the machine immediately", 0, KDB_REPEAT_NONE); -#if defined(CONFIG_MODULES) - kdb_register_repeat("lsmod", kdb_lsmod, "", - "List loaded kernel modules", 0, KDB_REPEAT_NONE); -#endif -#if defined(CONFIG_MAGIC_SYSRQ) - kdb_register_repeat("sr", kdb_sr, "", - "Magic SysRq key", 0, KDB_REPEAT_NONE); -#endif -#if defined(CONFIG_PRINTK) - kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", - "Display syslog buffer", 0, KDB_REPEAT_NONE); -#endif - kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", - "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); - kdb_register_repeat("kill", kdb_kill, "<-signal> ", - "Send a signal to a process", 0, KDB_REPEAT_NONE); - kdb_register_repeat("summary", kdb_summary, "", - "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, " [] []", - "Display per_cpu variables", 3, KDB_REPEAT_NONE); - kdb_register_repeat("grephelp", kdb_grep_help, "", - "Display help on | grep", 0, KDB_REPEAT_NONE); -} - -/* Execute any commands defined in kdb_cmds. 
*/ -static void __init kdb_cmd_init(void) -{ - int i, diag; - for (i = 0; kdb_cmds[i]; ++i) { - diag = kdb_parse(kdb_cmds[i]); - if (diag) - kdb_printf("kdb command %s failed, kdb diag %d\n", - kdb_cmds[i], diag); - } - if (defcmd_in_progress) { - kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n"); - kdb_parse("endefcmd"); - } -} - -/* Initialize kdb_printf, breakpoint tables and kdb state */ -void __init kdb_init(int lvl) -{ - static int kdb_init_lvl = KDB_NOT_INITIALIZED; - int i; - - if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl) - return; - for (i = kdb_init_lvl; i < lvl; i++) { - switch (i) { - case KDB_NOT_INITIALIZED: - kdb_inittab(); /* Initialize Command Table */ - kdb_initbptab(); /* Initialize Breakpoints */ - break; - case KDB_INIT_EARLY: - kdb_cmd_init(); /* Build kdb_cmds tables */ - break; - } - } - kdb_init_lvl = lvl; -} -/* - * Kernel Debugger Architecture Independent Support Functions - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. - * 03/02/13 added new 2.5 kallsyms - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "kdb_private.h" - -/* - * kdbgetsymval - Return the address of the given symbol. 
- * - * Parameters: - * symname Character string containing symbol name - * symtab Structure to receive results - * Returns: - * 0 Symbol not found, symtab zero filled - * 1 Symbol mapped to module/symbol/section, data in symtab - */ -int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) -{ - if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, - symtab); - memset(symtab, 0, sizeof(*symtab)); - symtab->sym_start = kallsyms_lookup_name(symname); - if (symtab->sym_start) { - if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: returns 1, " - "symtab->sym_start=0x%lx\n", - symtab->sym_start); - return 1; - } - if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: returns 0\n"); - return 0; -} -EXPORT_SYMBOL(kdbgetsymval); - -static char *kdb_name_table[100]; /* arbitrary size */ - -/* - * kdbnearsym - Return the name of the symbol with the nearest address - * less than 'addr'. - * - * Parameters: - * addr Address to check for symbol near - * symtab Structure to receive results - * Returns: - * 0 No sections contain this address, symtab zero filled - * 1 Address mapped to module/symbol/section, data in symtab - * Remarks: - * 2.6 kallsyms has a "feature" where it unpacks the name into a - * string. If that string is reused before the caller expects it - * then the caller sees its string change without warning. To - * avoid cluttering up the main kdb code with lots of kdb_strdup, - * tests and kfree calls, kdbnearsym maintains an LRU list of the - * last few unique strings. The list is sized large enough to - * hold active strings, no kdb caller of kdbnearsym makes more - * than ~20 later calls before using a saved value. 
- */ -int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) -{ - int ret = 0; - unsigned long symbolsize = 0; - unsigned long offset = 0; -#define knt1_size 128 /* must be >= kallsyms table size */ - char *knt1 = NULL; - - if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); - memset(symtab, 0, sizeof(*symtab)); - - if (addr < 4096) - goto out; - knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); - if (!knt1) { - kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n", - addr); - goto out; - } - symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, - (char **)(&symtab->mod_name), knt1); - if (offset > 8*1024*1024) { - symtab->sym_name = NULL; - addr = offset = symbolsize = 0; - } - symtab->sym_start = addr - offset; - symtab->sym_end = symtab->sym_start + symbolsize; - ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; - - if (ret) { - int i; - /* Another 2.6 kallsyms "feature". Sometimes the sym_name is - * set but the buffer passed into kallsyms_lookup is not used, - * so it contains garbage. The caller has to work out which - * buffer needs to be saved. - * - * What was Rusty smoking when he wrote that code? 
- */ - if (symtab->sym_name != knt1) { - strncpy(knt1, symtab->sym_name, knt1_size); - knt1[knt1_size-1] = '\0'; - } - for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { - if (kdb_name_table[i] && - strcmp(kdb_name_table[i], knt1) == 0) - break; - } - if (i >= ARRAY_SIZE(kdb_name_table)) { - debug_kfree(kdb_name_table[0]); - memcpy(kdb_name_table, kdb_name_table+1, - sizeof(kdb_name_table[0]) * - (ARRAY_SIZE(kdb_name_table)-1)); - } else { - debug_kfree(knt1); - knt1 = kdb_name_table[i]; - memcpy(kdb_name_table+i, kdb_name_table+i+1, - sizeof(kdb_name_table[0]) * - (ARRAY_SIZE(kdb_name_table)-i-1)); - } - i = ARRAY_SIZE(kdb_name_table) - 1; - kdb_name_table[i] = knt1; - symtab->sym_name = kdb_name_table[i]; - knt1 = NULL; - } - - if (symtab->mod_name == NULL) - symtab->mod_name = "kernel"; - if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " - "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, - symtab->sym_start, symtab->mod_name, symtab->sym_name, - symtab->sym_name); - -out: - debug_kfree(knt1); - return ret; -} - -void kdbnearsym_cleanup(void) -{ - int i; - for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { - if (kdb_name_table[i]) { - debug_kfree(kdb_name_table[i]); - kdb_name_table[i] = NULL; - } - } -} - -static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; - -/* - * kallsyms_symbol_complete - * - * Parameters: - * prefix_name prefix of a symbol name to lookup - * max_len maximum length that can be returned - * Returns: - * Number of symbols which match the given prefix. - * Notes: - * prefix_name is changed to contain the longest unique prefix that - * starts with this prefix (tab completion). 
- */ -int kallsyms_symbol_complete(char *prefix_name, int max_len) -{ - loff_t pos = 0; - int prefix_len = strlen(prefix_name), prev_len = 0; - int i, number = 0; - const char *name; - - while ((name = kdb_walk_kallsyms(&pos))) { - if (strncmp(name, prefix_name, prefix_len) == 0) { - strcpy(ks_namebuf, name); - /* Work out the longest name that matches the prefix */ - if (++number == 1) { - prev_len = min_t(int, max_len-1, - strlen(ks_namebuf)); - memcpy(ks_namebuf_prev, ks_namebuf, prev_len); - ks_namebuf_prev[prev_len] = '\0'; - continue; - } - for (i = 0; i < prev_len; i++) { - if (ks_namebuf[i] != ks_namebuf_prev[i]) { - prev_len = i; - ks_namebuf_prev[i] = '\0'; - break; - } - } - } - } - if (prev_len > prefix_len) - memcpy(prefix_name, ks_namebuf_prev, prev_len+1); - return number; -} - -/* - * kallsyms_symbol_next - * - * Parameters: - * prefix_name prefix of a symbol name to lookup - * flag 0 means search from the head, 1 means continue search. - * Returns: - * 1 if a symbol matches the given prefix. - * 0 if no string found - */ -int kallsyms_symbol_next(char *prefix_name, int flag) -{ - int prefix_len = strlen(prefix_name); - static loff_t pos; - const char *name; - - if (!flag) - pos = 0; - - while ((name = kdb_walk_kallsyms(&pos))) { - if (strncmp(name, prefix_name, prefix_len) == 0) { - strncpy(prefix_name, name, strlen(name)+1); - return 1; - } - } - return 0; -} - -/* - * kdb_symbol_print - Standard method for printing a symbol name and offset. - * Inputs: - * addr Address to be printed. - * symtab Address of symbol data, if NULL this routine does its - * own lookup. - * punc Punctuation for string, bit field. - * Remarks: - * The string and its punctuation is only printed if the address - * is inside the kernel, except that the value is always printed - * when requested. 
- */ -void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, - unsigned int punc) -{ - kdb_symtab_t symtab, *symtab_p2; - if (symtab_p) { - symtab_p2 = (kdb_symtab_t *)symtab_p; - } else { - symtab_p2 = &symtab; - kdbnearsym(addr, symtab_p2); - } - if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE))) - return; - if (punc & KDB_SP_SPACEB) - kdb_printf(" "); - if (punc & KDB_SP_VALUE) - kdb_printf(kdb_machreg_fmt0, addr); - if (symtab_p2->sym_name) { - if (punc & KDB_SP_VALUE) - kdb_printf(" "); - if (punc & KDB_SP_PAREN) - kdb_printf("("); - if (strcmp(symtab_p2->mod_name, "kernel")) - kdb_printf("[%s]", symtab_p2->mod_name); - kdb_printf("%s", symtab_p2->sym_name); - if (addr != symtab_p2->sym_start) - kdb_printf("+0x%lx", addr - symtab_p2->sym_start); - if (punc & KDB_SP_SYMSIZE) - kdb_printf("/0x%lx", - symtab_p2->sym_end - symtab_p2->sym_start); - if (punc & KDB_SP_PAREN) - kdb_printf(")"); - } - if (punc & KDB_SP_SPACEA) - kdb_printf(" "); - if (punc & KDB_SP_NEWLINE) - kdb_printf("\n"); -} - -/* - * kdb_strdup - kdb equivalent of strdup, for disasm code. - * Inputs: - * str The string to duplicate. - * type Flags to kmalloc for the new string. - * Returns: - * Address of the new string, NULL if storage could not be allocated. - * Remarks: - * This is not in lib/string.c because it uses kmalloc which is not - * available when string.o is used in boot loaders. - */ -char *kdb_strdup(const char *str, gfp_t type) -{ - int n = strlen(str)+1; - char *s = kmalloc(n, type); - if (!s) - return NULL; - return strcpy(s, str); -} - -/* - * kdb_getarea_size - Read an area of data. The kdb equivalent of - * copy_from_user, with kdb messages for invalid addresses. - * Inputs: - * res Pointer to the area to receive the result. - * addr Address of the area to copy. - * size Size of the area. - * Returns: - * 0 for success, < 0 for error. 
- */ -int kdb_getarea_size(void *res, unsigned long addr, size_t size) -{ - int ret = probe_kernel_read((char *)res, (char *)addr, size); - if (ret) { - if (!KDB_STATE(SUPPRESS)) { - kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); - KDB_STATE_SET(SUPPRESS); - } - ret = KDB_BADADDR; - } else { - KDB_STATE_CLEAR(SUPPRESS); - } - return ret; -} - -/* - * kdb_putarea_size - Write an area of data. The kdb equivalent of - * copy_to_user, with kdb messages for invalid addresses. - * Inputs: - * addr Address of the area to write to. - * res Pointer to the area holding the data. - * size Size of the area. - * Returns: - * 0 for success, < 0 for error. - */ -int kdb_putarea_size(unsigned long addr, void *res, size_t size) -{ - int ret = probe_kernel_read((char *)addr, (char *)res, size); - if (ret) { - if (!KDB_STATE(SUPPRESS)) { - kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); - KDB_STATE_SET(SUPPRESS); - } - ret = KDB_BADADDR; - } else { - KDB_STATE_CLEAR(SUPPRESS); - } - return ret; -} - -/* - * kdb_getphys - Read data from a physical address. Validate the - * address is in range, use kmap_atomic() to get data - * similar to kdb_getarea() - but for phys addresses - * Inputs: - * res Pointer to the word to receive the result - * addr Physical address of the area to copy - * size Size of the area - * Returns: - * 0 for success, < 0 for error. - */ -static int kdb_getphys(void *res, unsigned long addr, size_t size) -{ - unsigned long pfn; - void *vaddr; - struct page *page; - - pfn = (addr >> PAGE_SHIFT); - if (!pfn_valid(pfn)) - return 1; - page = pfn_to_page(pfn); - vaddr = kmap_atomic(page, KM_KDB); - memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr, KM_KDB); - - return 0; -} - -/* - * kdb_getphysword - * Inputs: - * word Pointer to the word to receive the result. - * addr Address of the area to copy. - * size Size of the area. - * Returns: - * 0 for success, < 0 for error. 
- */ -int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) -{ - int diag; - __u8 w1; - __u16 w2; - __u32 w4; - __u64 w8; - *word = 0; /* Default value if addr or size is invalid */ - - switch (size) { - case 1: - diag = kdb_getphys(&w1, addr, sizeof(w1)); - if (!diag) - *word = w1; - break; - case 2: - diag = kdb_getphys(&w2, addr, sizeof(w2)); - if (!diag) - *word = w2; - break; - case 4: - diag = kdb_getphys(&w4, addr, sizeof(w4)); - if (!diag) - *word = w4; - break; - case 8: - if (size <= sizeof(*word)) { - diag = kdb_getphys(&w8, addr, sizeof(w8)); - if (!diag) - *word = w8; - break; - } - /* drop through */ - default: - diag = KDB_BADWIDTH; - kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); - } - return diag; -} - -/* - * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats - * data as numbers. - * Inputs: - * word Pointer to the word to receive the result. - * addr Address of the area to copy. - * size Size of the area. - * Returns: - * 0 for success, < 0 for error. - */ -int kdb_getword(unsigned long *word, unsigned long addr, size_t size) -{ - int diag; - __u8 w1; - __u16 w2; - __u32 w4; - __u64 w8; - *word = 0; /* Default value if addr or size is invalid */ - switch (size) { - case 1: - diag = kdb_getarea(w1, addr); - if (!diag) - *word = w1; - break; - case 2: - diag = kdb_getarea(w2, addr); - if (!diag) - *word = w2; - break; - case 4: - diag = kdb_getarea(w4, addr); - if (!diag) - *word = w4; - break; - case 8: - if (size <= sizeof(*word)) { - diag = kdb_getarea(w8, addr); - if (!diag) - *word = w8; - break; - } - /* drop through */ - default: - diag = KDB_BADWIDTH; - kdb_printf("kdb_getword: bad width %ld\n", (long) size); - } - return diag; -} - -/* - * kdb_putword - Write a binary value. Unlike kdb_putarea, this - * treats data as numbers. - * Inputs: - * addr Address of the area to write to.. - * word The value to set. - * size Size of the area. - * Returns: - * 0 for success, < 0 for error. 
- */ -int kdb_putword(unsigned long addr, unsigned long word, size_t size) -{ - int diag; - __u8 w1; - __u16 w2; - __u32 w4; - __u64 w8; - switch (size) { - case 1: - w1 = word; - diag = kdb_putarea(addr, w1); - break; - case 2: - w2 = word; - diag = kdb_putarea(addr, w2); - break; - case 4: - w4 = word; - diag = kdb_putarea(addr, w4); - break; - case 8: - if (size <= sizeof(word)) { - w8 = word; - diag = kdb_putarea(addr, w8); - break; - } - /* drop through */ - default: - diag = KDB_BADWIDTH; - kdb_printf("kdb_putword: bad width %ld\n", (long) size); - } - return diag; -} - -/* - * kdb_task_state_string - Convert a string containing any of the - * letters DRSTCZEUIMA to a mask for the process state field and - * return the value. If no argument is supplied, return the mask - * that corresponds to environment variable PS, DRSTCZEU by - * default. - * Inputs: - * s String to convert - * Returns: - * Mask for process state. - * Notes: - * The mask folds data from several sources into a single long value, so - * be careful not to overlap the bits. TASK_* bits are in the LSB, - * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there - * is no overlap between TASK_* and EXIT_* but that may not always be - * true, so EXIT_* bits are shifted left 16 bits before being stored in - * the mask. 
- */ - -/* unrunnable is < 0 */ -#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1)) -#define RUNNING (1UL << (8*sizeof(unsigned long) - 2)) -#define IDLE (1UL << (8*sizeof(unsigned long) - 3)) -#define DAEMON (1UL << (8*sizeof(unsigned long) - 4)) - -unsigned long kdb_task_state_string(const char *s) -{ - long res = 0; - if (!s) { - s = kdbgetenv("PS"); - if (!s) - s = "DRSTCZEU"; /* default value for ps */ - } - while (*s) { - switch (*s) { - case 'D': - res |= TASK_UNINTERRUPTIBLE; - break; - case 'R': - res |= RUNNING; - break; - case 'S': - res |= TASK_INTERRUPTIBLE; - break; - case 'T': - res |= TASK_STOPPED; - break; - case 'C': - res |= TASK_TRACED; - break; - case 'Z': - res |= EXIT_ZOMBIE << 16; - break; - case 'E': - res |= EXIT_DEAD << 16; - break; - case 'U': - res |= UNRUNNABLE; - break; - case 'I': - res |= IDLE; - break; - case 'M': - res |= DAEMON; - break; - case 'A': - res = ~0UL; - break; - default: - kdb_printf("%s: unknown flag '%c' ignored\n", - __func__, *s); - break; - } - ++s; - } - return res; -} - -/* - * kdb_task_state_char - Return the character that represents the task state. - * Inputs: - * p struct task for the process - * Returns: - * One character to represent the task state. - */ -char kdb_task_state_char (const struct task_struct *p) -{ - int cpu; - char state; - unsigned long tmp; - - if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) - return 'E'; - - cpu = kdb_process_cpu(p); - state = (p->state == 0) ? 'R' : - (p->state < 0) ? 'U' : - (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : - (p->state & TASK_STOPPED) ? 'T' : - (p->state & TASK_TRACED) ? 'C' : - (p->exit_state & EXIT_ZOMBIE) ? 'Z' : - (p->exit_state & EXIT_DEAD) ? 'E' : - (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; - if (is_idle_task(p)) { - /* Idle task. Is it really idle, apart from the kdb - * interrupt? 
*/ - if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { - if (cpu != kdb_initial_cpu) - state = 'I'; /* idle task */ - } - } else if (!p->mm && state == 'S') { - state = 'M'; /* sleeping system daemon */ - } - return state; -} - -/* - * kdb_task_state - Return true if a process has the desired state - * given by the mask. - * Inputs: - * p struct task for the process - * mask mask from kdb_task_state_string to select processes - * Returns: - * True if the process matches at least one criteria defined by the mask. - */ -unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) -{ - char state[] = { kdb_task_state_char(p), '\0' }; - return (mask & kdb_task_state_string(state)) != 0; -} - -/* - * kdb_print_nameval - Print a name and its value, converting the - * value to a symbol lookup if possible. - * Inputs: - * name field name to print - * val value of field - */ -void kdb_print_nameval(const char *name, unsigned long val) -{ - kdb_symtab_t symtab; - kdb_printf(" %-11.11s ", name); - if (kdbnearsym(val, &symtab)) - kdb_symbol_print(val, &symtab, - KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE); - else - kdb_printf("0x%lx\n", val); -} - -/* Last ditch allocator for debugging, so we can still debug even when - * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned - * for space usage, not for speed. One smallish memory pool, the free - * chain is always in ascending address order to allow coalescing, - * allocations are done in brute force best fit. - */ - -struct debug_alloc_header { - u32 next; /* offset of next header from start of pool */ - u32 size; - void *caller; -}; - -/* The memory returned by this allocator must be aligned, which means - * so must the header size. Do not assume that sizeof(struct - * debug_alloc_header) is a multiple of the alignment, explicitly - * calculate the overhead of this header, including the alignment. - * The rest of this code must not use sizeof() on any header or - * pointer to a header. 
- */ -#define dah_align 8 -#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) - -static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ -static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; -static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; - -/* Locking is awkward. The debug code is called from all contexts, - * including non maskable interrupts. A normal spinlock is not safe - * in NMI context. Try to get the debug allocator lock, if it cannot - * be obtained after a second then give up. If the lock could not be - * previously obtained on this cpu then only try once. - * - * sparse has no annotation for "this function _sometimes_ acquires a - * lock", so fudge the acquire/release notation. - */ -static DEFINE_SPINLOCK(dap_lock); -static int get_dap_lock(void) - __acquires(dap_lock) -{ - static int dap_locked = -1; - int count; - if (dap_locked == smp_processor_id()) - count = 1; - else - count = 1000; - while (1) { - if (spin_trylock(&dap_lock)) { - dap_locked = -1; - return 1; - } - if (!count--) - break; - udelay(1000); - } - dap_locked = smp_processor_id(); - __acquire(dap_lock); - return 0; -} - -void *debug_kmalloc(size_t size, gfp_t flags) -{ - unsigned int rem, h_offset; - struct debug_alloc_header *best, *bestprev, *prev, *h; - void *p = NULL; - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return NULL; - } - h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); - if (dah_first_call) { - h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; - dah_first_call = 0; - } - size = ALIGN(size, dah_align); - prev = best = bestprev = NULL; - while (1) { - if (h->size >= size && (!best || h->size < best->size)) { - best = h; - bestprev = prev; - if (h->size == size) - break; - } - if (!h->next) - break; - prev = h; - h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); - } - if (!best) - goto out; - rem = best->size - size; - /* The pool 
must always contain at least one header */ - if (best->next == 0 && bestprev == NULL && rem < dah_overhead) - goto out; - if (rem >= dah_overhead) { - best->size = size; - h_offset = ((char *)best - debug_alloc_pool) + - dah_overhead + best->size; - h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); - h->size = rem - dah_overhead; - h->next = best->next; - } else - h_offset = best->next; - best->caller = __builtin_return_address(0); - dah_used += best->size; - dah_used_max = max(dah_used, dah_used_max); - if (bestprev) - bestprev->next = h_offset; - else - dah_first = h_offset; - p = (char *)best + dah_overhead; - memset(p, POISON_INUSE, best->size - 1); - *((char *)p + best->size - 1) = POISON_END; -out: - spin_unlock(&dap_lock); - return p; -} - -void debug_kfree(void *p) -{ - struct debug_alloc_header *h; - unsigned int h_offset; - if (!p) - return; - if ((char *)p < debug_alloc_pool || - (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { - kfree(p); - return; - } - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return; /* memory leak, cannot be helped */ - } - h = (struct debug_alloc_header *)((char *)p - dah_overhead); - memset(p, POISON_FREE, h->size - 1); - *((char *)p + h->size - 1) = POISON_END; - h->caller = NULL; - dah_used -= h->size; - h_offset = (char *)h - debug_alloc_pool; - if (h_offset < dah_first) { - h->next = dah_first; - dah_first = h_offset; - } else { - struct debug_alloc_header *prev; - unsigned int prev_offset; - prev = (struct debug_alloc_header *)(debug_alloc_pool + - dah_first); - while (1) { - if (!prev->next || prev->next > h_offset) - break; - prev = (struct debug_alloc_header *) - (debug_alloc_pool + prev->next); - } - prev_offset = (char *)prev - debug_alloc_pool; - if (prev_offset + dah_overhead + prev->size == h_offset) { - prev->size += dah_overhead + h->size; - memset(h, POISON_FREE, dah_overhead - 1); - *((char *)h + dah_overhead - 1) = POISON_END; - h = prev; - 
h_offset = prev_offset; - } else { - h->next = prev->next; - prev->next = h_offset; - } - } - if (h_offset + dah_overhead + h->size == h->next) { - struct debug_alloc_header *next; - next = (struct debug_alloc_header *) - (debug_alloc_pool + h->next); - h->size += dah_overhead + next->size; - h->next = next->next; - memset(next, POISON_FREE, dah_overhead - 1); - *((char *)next + dah_overhead - 1) = POISON_END; - } - spin_unlock(&dap_lock); -} - -void debug_kusage(void) -{ - struct debug_alloc_header *h_free, *h_used; -#ifdef CONFIG_IA64 - /* FIXME: using dah for ia64 unwind always results in a memory leak. - * Fix that memory leak first, then set debug_kusage_one_time = 1 for - * all architectures. - */ - static int debug_kusage_one_time; -#else - static int debug_kusage_one_time = 1; -#endif - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return; - } - h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); - if (dah_first == 0 && - (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || - dah_first_call)) - goto out; - if (!debug_kusage_one_time) - goto out; - debug_kusage_one_time = 0; - kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n", - __func__, dah_first); - if (dah_first) { - h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_printf("%s: h_used %p size %d\n", __func__, h_used, - h_used->size); - } - do { - h_used = (struct debug_alloc_header *) - ((char *)h_free + dah_overhead + h_free->size); - kdb_printf("%s: h_used %p size %d caller %p\n", - __func__, h_used, h_used->size, h_used->caller); - h_free = (struct debug_alloc_header *) - (debug_alloc_pool + h_free->next); - } while (h_free->next); - h_used = (struct debug_alloc_header *) - ((char *)h_free + dah_overhead + h_free->size); - if ((char *)h_used - debug_alloc_pool != - sizeof(debug_alloc_pool_aligned)) - kdb_printf("%s: h_used %p size %d caller %p\n", - __func__, h_used, h_used->size, h_used->caller); -out: - 
spin_unlock(&dap_lock); -} - -/* Maintain a small stack of kdb_flags to allow recursion without disturbing - * the global kdb state. - */ - -static int kdb_flags_stack[4], kdb_flags_index; - -void kdb_save_flags(void) -{ - BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack)); - kdb_flags_stack[kdb_flags_index++] = kdb_flags; -} - -void kdb_restore_flags(void) -{ - BUG_ON(kdb_flags_index <= 0); - kdb_flags = kdb_flags_stack[--kdb_flags_index]; -} -/* delayacct.c - per-task delay accounting - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -EXPORT_SYMBOL_GPL(delayacct_on); -struct kmem_cache *delayacct_cache; - -static int __init delayacct_setup_disable(char *str) -{ - delayacct_on = 0; - return 1; -} -__setup("nodelayacct", delayacct_setup_disable); - -void delayacct_init(void) -{ - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); - delayacct_tsk_init(&init_task); -} - -void __delayacct_tsk_init(struct task_struct *tsk) -{ - tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); - if (tsk->delays) - spin_lock_init(&tsk->delays->lock); -} - -/* - * Start accounting for a delay statistic using - * its starting timestamp (@start) - */ - -static inline void delayacct_start(struct timespec *start) -{ - do_posix_clock_monotonic_gettime(start); -} - -/* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count - */ - -static void delayacct_end(struct timespec *start, struct timespec *end, - u64 *total, u32 *count) -{ - struct timespec ts; - s64 ns; - unsigned long flags; - - do_posix_clock_monotonic_gettime(end); - ts = timespec_sub(*end, *start); - ns = timespec_to_ns(&ts); - if (ns < 0) - return; - - spin_lock_irqsave(¤t->delays->lock, flags); - *total += ns; - (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); -} - -void __delayacct_blkio_start(void) -{ - delayacct_start(¤t->delays->blkio_start); -} - -void __delayacct_blkio_end(void) -{ - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_end, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); -} - -int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) -{ - s64 
tmp; - unsigned long t1; - unsigned long long t2, t3; - unsigned long flags; - struct timespec ts; - - /* Though tsk->delays accessed later, early exit avoids - * unnecessary returning of other data - */ - if (!tsk->delays) - goto done; - - tmp = (s64)d->cpu_run_real_total; - cputime_to_timespec(tsk->utime + tsk->stime, &ts); - tmp += timespec_to_ns(&ts); - d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - - tmp = (s64)d->cpu_scaled_run_real_total; - cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); - tmp += timespec_to_ns(&ts); - d->cpu_scaled_run_real_total = - (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; - - /* - * No locking available for sched_info (and too expensive to add one) - * Mitigate by taking snapshot of values - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; - t3 = tsk->se.sum_exec_runtime; - - d->cpu_count += t1; - - tmp = (s64)d->cpu_delay_total + t2; - d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - - tmp = (s64)d->cpu_run_virtual_total + t3; - d->cpu_run_virtual_total = - (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; - - /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - - spin_lock_irqsave(&tsk->delays->lock, flags); - tmp = d->blkio_delay_total + tsk->delays->blkio_delay; - d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; - tmp = d->swapin_delay_total + tsk->delays->swapin_delay; - d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; - tmp = d->freepages_delay_total + tsk->delays->freepages_delay; - d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 
0 : tmp; - d->blkio_count += tsk->delays->blkio_count; - d->swapin_count += tsk->delays->swapin_count; - d->freepages_count += tsk->delays->freepages_count; - spin_unlock_irqrestore(&tsk->delays->lock, flags); - -done: - return 0; -} - -__u64 __delayacct_blkio_ticks(struct task_struct *tsk) -{ - __u64 ret; - unsigned long flags; - - spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); - spin_unlock_irqrestore(&tsk->delays->lock, flags); - return ret; -} - -void __delayacct_freepages_start(void) -{ - delayacct_start(¤t->delays->freepages_start); -} - -void __delayacct_freepages_end(void) -{ - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_end, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); -} - -/* - * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. - * - * Written by Hennus Bergman, 1992. - * - * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma. - * In the previous version the reported device could end up being wrong, - * if a device requested a DMA channel that was already in use. - * [It also happened to remove the sizeof(char *) == sizeof(int) - * assumption introduced because of those /proc/dma patches. -- Hennus] - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -/* A note on resource allocation: - * - * All drivers needing DMA channels, should allocate and release them - * through the public routines `request_dma()' and `free_dma()'. - * - * In order to avoid problems, all processes should allocate resources in - * the same sequence and release them in the reverse order. - * - * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA. - * When releasing them, first release the DMA, then release the IRQ. - * If you don't, you may cause allocation requests to fail unnecessarily. 
- * This doesn't really matter now, but it will once we get real semaphores - * in the kernel. - */ - - -DEFINE_SPINLOCK(dma_spin_lock); - -/* - * If our port doesn't define this it has no PC like DMA - */ - -#ifdef MAX_DMA_CHANNELS - - -/* Channel n is busy iff dma_chan_busy[n].lock != 0. - * DMA0 used to be reserved for DRAM refresh, but apparently not any more... - * DMA4 is reserved for cascading. - */ - -struct dma_chan { - int lock; - const char *device_id; -}; - -static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { - [4] = { 1, "cascade" }, -}; - - -/** - * request_dma - request and reserve a system DMA channel - * @dmanr: DMA channel number - * @device_id: reserving device ID string, used in /proc/dma - */ -int request_dma(unsigned int dmanr, const char * device_id) -{ - if (dmanr >= MAX_DMA_CHANNELS) - return -EINVAL; - - if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) - return -EBUSY; - - dma_chan_busy[dmanr].device_id = device_id; - - /* old flag was 0, now contains 1 to indicate busy */ - return 0; -} /* request_dma */ - -/** - * free_dma - free a reserved system DMA channel - * @dmanr: DMA channel number - */ -void free_dma(unsigned int dmanr) -{ - if (dmanr >= MAX_DMA_CHANNELS) { - printk(KERN_WARNING "Trying to free DMA%d\n", dmanr); - return; - } - - if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { - printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); - return; - } - -} /* free_dma */ - -#else - -int request_dma(unsigned int dmanr, const char *device_id) -{ - return -EINVAL; -} - -void free_dma(unsigned int dmanr) -{ -} - -#endif - -#ifdef CONFIG_PROC_FS - -#ifdef MAX_DMA_CHANNELS -static int proc_dma_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { - if (dma_chan_busy[i].lock) { - seq_printf(m, "%2d: %s\n", i, - dma_chan_busy[i].device_id); - } - } - return 0; -} -#else -static int proc_dma_show(struct seq_file *m, void *v) -{ - seq_puts(m, "No DMA\n"); - return 0; -} -#endif /* MAX_DMA_CHANNELS 
*/ - -static int proc_dma_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_dma_show, NULL); -} - -static const struct file_operations proc_dma_operations = { - .open = proc_dma_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_dma_init(void) -{ - proc_create("dma", 0, NULL, &proc_dma_operations); - return 0; -} - -__initcall(proc_dma_init); -#endif - -EXPORT_SYMBOL(request_dma); -EXPORT_SYMBOL(free_dma); -EXPORT_SYMBOL(dma_spin_lock); -#include -#include -#include - -#include - - -Elf_Half __weak elf_core_extra_phdrs(void) -{ - return 0; -} - -int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, - unsigned long limit) -{ - return 1; -} - -int __weak elf_core_write_extra_data(struct file *file, size_t *size, - unsigned long limit) -{ - return 1; -} - -size_t __weak elf_core_extra_data_size(void) -{ - return 0; -} -/* - * Performance events callchain code, extracted from core.c: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. 
- * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include "internal.h" - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -static struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. 
- */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - 
perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} -/* - * Performance events core code: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" - -#include - -struct remote_function_call { - struct task_struct *p; - int (*func)(void *info); - void *info; - int ret; -}; - -static void remote_function(void *data) -{ - struct remote_function_call *tfc = data; - struct task_struct *p = tfc->p; - - if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) - return; - } - - tfc->ret = tfc->func(tfc->info); -} - -/** - * task_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. 
This might - * be on the current CPU, which just calls the function directly - * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away - */ -static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) -{ - struct remote_function_call data = { - .p = p, - .func = func, - .info = info, - .ret = -ESRCH, /* No such (running) process */ - }; - - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); - - return data.ret; -} - -/** - * cpu_function_call - call a function on the cpu - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func on the remote cpu. - * - * returns: @func return value or -ENXIO when the cpu is offline - */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) -{ - struct remote_function_call data = { - .p = NULL, - .func = func, - .info = info, - .ret = -ENXIO, /* No such CPU */ - }; - - smp_call_function_single(cpu, remote_function, &data, 1); - - return data.ret; -} - -#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ - PERF_FLAG_FD_OUTPUT |\ - PERF_FLAG_PID_CGROUP) - -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - -/* - * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu - */ -struct jump_label_key_deferred perf_sched_events __read_mostly; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); - -static atomic_t nr_mmap_events __read_mostly; -static atomic_t nr_comm_events __read_mostly; -static atomic_t nr_task_events __read_mostly; - -static LIST_HEAD(pmus); -static DEFINE_MUTEX(pmus_lock); -static struct srcu_struct pmus_srcu; - -/* - * perf event paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu events for unpriv - * 2 - disallow kernel profiling for unpriv 
- */ -int sysctl_perf_event_paranoid __read_mostly = 1; - -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ - -/* - * max perf event sample rate - */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); - -int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - - max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); - - return 0; -} - -static atomic64_t perf_event_id; - -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - -static void update_context_time(struct perf_event_context *ctx); -static u64 perf_event_time(struct perf_event *event); - -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - -void __weak perf_event_print_debug(void) { } - -extern __weak const char *perf_pmu_name(void) -{ - return "pmu"; -} - -static inline u64 perf_clock(void) -{ - return local_clock(); -} - -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - -#ifdef CONFIG_CGROUP_PERF - -/* - * Must ensure cgroup 
is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. - */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); -} - -static inline bool -perf_cgroup_match(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - return !event->cgrp || event->cgrp == cpuctx->cgrp; -} - -static inline void perf_get_cgroup(struct perf_event *event) -{ - css_get(&event->cgrp->css); -} - -static inline void perf_put_cgroup(struct perf_event *event) -{ - css_put(&event->cgrp->css); -} - -static inline void perf_detach_cgroup(struct perf_event *event) -{ - perf_put_cgroup(event); - event->cgrp = NULL; -} - -static inline int is_cgroup_event(struct perf_event *event) -{ - return event->cgrp != NULL; -} - -static inline u64 perf_cgroup_event_time(struct perf_event *event) -{ - struct perf_cgroup_info *t; - - t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; -} - -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) -{ - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); - - info = this_cpu_ptr(cgrp->info); - - info->time += now - info->timestamp; - info->timestamp = now; -} - -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); -} - -static inline void update_cgrp_time_from_event(struct perf_event *event) -{ - struct perf_cgroup *cgrp; - - /* - * ensure we access cgroup data only when needed and - * when we know the cgroup is pinned (css_get) - */ - if (!is_cgroup_event(event)) - return; - - cgrp = perf_cgroup_from_task(current); - /* - * Do not update time when cgroup is not active - */ - if (cgrp == event->cgrp) 
- __update_cgrp_time(event->cgrp); -} - -static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ - struct perf_cgroup *cgrp; - struct perf_cgroup_info *info; - - /* - * ctx->lock held by caller - * ensure we do not access cgroup data - * unless we have the cgroup pinned (css_get) - */ - if (!task || !ctx->nr_cgroups) - return; - - cgrp = perf_cgroup_from_task(task); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; -} - -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - -/* - * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next - */ -void perf_cgroup_switch(struct task_struct *task, int mode) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. - */ - local_irq_save(flags); - - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. 
- */ - if (cpuctx->ctx.nr_cgroups > 0) { - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } - - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around - */ - cpuctx->cgrp = perf_cgroup_from_task(task); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - /* - * we come here when we know perf_cgroup_events > 0 - */ - cgrp1 = perf_cgroup_from_task(task); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next); - - /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - /* - * we come here when we know perf_cgroup_events > 0 - */ - cgrp1 = perf_cgroup_from_task(task); - - /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); - - /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. 
- */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); -} - -static inline int perf_cgroup_connect(int fd, struct perf_event *event, - struct perf_event_attr *attr, - struct perf_event *group_leader) -{ - struct perf_cgroup *cgrp; - struct cgroup_subsys_state *css; - struct file *file; - int ret = 0, fput_needed; - - file = fget_light(fd, &fput_needed); - if (!file) - return -EBADF; - - css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } - - cgrp = container_of(css, struct perf_cgroup, css); - event->cgrp = cgrp; - - /* must be done before we fput() the file */ - perf_get_cgroup(event); - - /* - * all events in a group must monitor - * the same cgroup because a task belongs - * to only one perf cgroup at a time - */ - if (group_leader && group_leader->cgrp != cgrp) { - perf_detach_cgroup(event); - ret = -EINVAL; - } -out: - fput_light(file, fput_needed); - return ret; -} - -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. 
- */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} -#else /* !CONFIG_CGROUP_PERF */ - -static inline bool -perf_cgroup_match(struct perf_event *event) -{ - return true; -} - -static inline void perf_detach_cgroup(struct perf_event *event) -{} - -static inline int is_cgroup_event(struct perf_event *event) -{ - return 0; -} - -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - -static inline void update_cgrp_time_from_event(struct perf_event *event) -{ -} - -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ -} - -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ -} - -static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, - struct perf_event_attr *attr, - struct perf_event *group_leader) -{ - return -EINVAL; -} - -static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) -{ -} - -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ -} - -static inline u64 perf_cgroup_event_time(struct perf_event *event) -{ - return 0; -} - -static inline void -perf_cgroup_defer_enabled(struct perf_event 
*event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} -#endif - -void perf_pmu_disable(struct pmu *pmu) -{ - int *count = this_cpu_ptr(pmu->pmu_disable_count); - if (!(*count)++) - pmu->pmu_disable(pmu); -} - -void perf_pmu_enable(struct pmu *pmu) -{ - int *count = this_cpu_ptr(pmu->pmu_disable_count); - if (!--(*count)) - pmu->pmu_enable(pmu); -} - -static DEFINE_PER_CPU(struct list_head, rotation_list); - -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ -static void perf_pmu_rotate_start(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); - - WARN_ON(!irqs_disabled()); - - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); -} - -static void get_ctx(struct perf_event_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void put_ctx(struct perf_event_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - kfree_rcu(ctx, rcu_head); - } -} - -static void unclone_ctx(struct perf_event_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_tgid_nr_ns(p, event->ns); -} - -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_pid_nr_ns(p, 
event->ns); -} - -/* - * If we inherit events we want to return the parent event id - * to userspace. - */ -static u64 primary_event_id(struct perf_event *event) -{ - u64 id = event->id; - - if (event->parent) - id = event->parent->id; - - return id; -} - -/* - * Get the perf_event_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) -{ - struct perf_event_context *ctx; - - rcu_read_lock(); -retry: - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_event_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. 
- */ -static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) -{ - struct perf_event_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, ctxn, &flags); - if (ctx) { - ++ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_event_context *ctx) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -static u64 perf_event_time(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - - if (is_cgroup_event(event)) - return perf_cgroup_event_time(event); - - return ctx ? ctx->time : 0; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. 
- */ - if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) -{ - if (event->attr.pinned) - return &ctx->pinned_groups; - else - return &ctx->flexible_groups; -} - -/* - * Add a event from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_event(struct perf_event *event, struct perf_event_context *ctx) -{ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - event->attach_state |= PERF_ATTACH_CONTEXT; - - /* - * If we're a stand alone event or group leader, we go to the context - * list, group events are kept attached to the group so that - * perf_group_detach can, at all times, locate all siblings. 
- */ - if (event->group_leader == event) { - struct list_head *list; - - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); - } - - if (is_cgroup_event(event)) - ctx->nr_cgroups++; - - list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); - ctx->nr_events++; - if (event->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - event->read_size = size; -} - -static void perf_event__header_size(struct perf_event *event) -{ - struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; - u16 size = 0; - - perf_event__read_size(event); - - if (sample_type & PERF_SAMPLE_IP) - size += sizeof(data->ip); - - if (sample_type & PERF_SAMPLE_ADDR) - size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_PERIOD) - size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - size += event->read_size; - - event->header_size = size; -} - -static void perf_event__id_header_size(struct perf_event *event) -{ - struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; - u16 size = 0; - - if (sample_type & PERF_SAMPLE_TID) - size += sizeof(data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - size += sizeof(data->time); - - if (sample_type & PERF_SAMPLE_ID) - 
size += sizeof(data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - size += sizeof(data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - size += sizeof(data->cpu_entry); - - event->id_header_size = size; -} - -static void perf_group_attach(struct perf_event *event) -{ - struct perf_event *group_leader = event->group_leader, *pos; - - /* - * We can have double attach due to group movement in perf_event_open. - */ - if (event->attach_state & PERF_ATTACH_GROUP) - return; - - event->attach_state |= PERF_ATTACH_GROUP; - - if (group_leader == event) - return; - - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - - perf_event__header_size(group_leader); - - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) - perf_event__header_size(pos); -} - -/* - * Remove a event from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_event(struct perf_event *event, struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx; - /* - * We can have double detach due to exit/hot-unplug + close. - */ - if (!(event->attach_state & PERF_ATTACH_CONTEXT)) - return; - - event->attach_state &= ~PERF_ATTACH_CONTEXT; - - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - cpuctx = __get_cpu_context(ctx); - /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } - - ctx->nr_events--; - if (event->attr.inherit_stat) - ctx->nr_stat--; - - list_del_rcu(&event->event_entry); - - if (event->group_leader == event) - list_del_init(&event->group_entry); - - update_group_times(event); - - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). 
The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; -} - -static void perf_group_detach(struct perf_event *event) -{ - struct perf_event *sibling, *tmp; - struct list_head *list = NULL; - - /* - * We can have double detach due to exit/hot-unplug + close. - */ - if (!(event->attach_state & PERF_ATTACH_GROUP)) - return; - - event->attach_state &= ~PERF_ATTACH_GROUP; - - /* - * If this is a sibling, remove it from its group. - */ - if (event->group_leader != event) { - list_del_init(&event->group_entry); - event->group_leader->nr_siblings--; - goto out; - } - - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - - /* - * If this was a group event with sibling events then - * upgrade the siblings to singleton events by adding them - * to whatever list we are on. - */ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); - sibling->group_leader = sibling; - - /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; - } - -out: - perf_event__header_size(event->group_leader); - - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) - perf_event__header_size(tmp); -} - -static inline int -event_filter_match(struct perf_event *event) -{ - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event); -} - -static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - u64 delta; - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && 
!event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return; - - event->state = PERF_EVENT_STATE_INACTIVE; - if (event->pending_disable) { - event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; - } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; - - if (!is_software_event(event)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (event->attr.freq && event->attr.sample_freq) - ctx->nr_freq--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event; - int state = group_event->state; - - event_sched_out(group_event, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) - event_sched_out(event, cpuctx, ctx); - - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance event - * - * We disable the event on the hardware level first. After that we - * remove it from the context list. - */ -static int __perf_remove_from_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { - ctx->is_active = 0; - cpuctx->task_ctx = NULL; - } - raw_spin_unlock(&ctx->lock); - - return 0; -} - - -/* - * Remove the event from a task's (or a CPU's) list of events. - * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. 
- * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_event_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_remove_from_context(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - lockdep_assert_held(&ctx->mutex); - - if (!task) { - /* - * Per cpu events are removed via an smp call and - * the removal is always successful. - */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); -} - -/* - * Cross CPU call to disable a performance event - */ -static int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. 
- */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Disable a event. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_event_for_each_child or perf_event_for_each because they - * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. - * When called from perf_pending_event it's OK because event->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_event_task_sched_out for this context. - */ -void perf_event_disable(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. 
- */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } - raw_spin_unlock_irq(&ctx->lock); -} -EXPORT_SYMBOL_GPL(perf_event_disable); - -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); - else - event->shadow_ctx_time = tstamp - ctx->timestamp; -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_event *event, int enable); - -static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - - if (event->state <= PERF_EVENT_STATE_OFF) - return 0; - - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); - - /* - * Unthrottle events, since we scheduled we might have missed several - * ticks already, also for a heavily scheduling task there is little - * guarantee it'll get a tick in a timely manner. 
- */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } - - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; - event->oncpu = -1; - return -EAGAIN; - } - - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - - if (!is_software_event(event)) - cpuctx->active_oncpu++; - ctx->nr_active++; - if (event->attr.freq && event->attr.sample_freq) - ctx->nr_freq++; - - if (event->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; - u64 now = ctx->time; - bool simulate = false; - - if (group_event->state == PERF_EVENT_STATE_OFF) - return 0; - - pmu->start_txn(pmu); - - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - return -EAGAIN; - } - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { - partial_group = event; - goto group_error; - } - } - - if (!pmu->commit_txn(pmu)) - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. 
This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event == partial_group) - simulate = true; - - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } - } - event_sched_out(group_event, cpuctx, ctx); - - pmu->cancel_txn(pmu); - - return -EAGAIN; -} - -/* - * Work out whether we can put this event group on the CPU now. - */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software events can always go on. - */ - if (event->group_flags & PERF_GROUP_SOFTWARE) - return 1; - /* - * If an exclusive group is already on, no other hardware - * events can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * events on the CPU, it can't go on. - */ - if (event->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. 
- */ - return can_add_hw; -} - -static void add_event_to_ctx(struct perf_event *event, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - - list_add_event(event, ctx); - perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; -} - -static void task_ctx_sched_out(struct perf_event_context *ctx); -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - -static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - struct task_struct *task) -{ - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); - if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); -} - -/* - * Cross CPU call to install and enable a performance event - * - * Must be called with ctx->mutex held - */ -static int __perf_install_in_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(task_ctx); - - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); - raw_spin_lock(&ctx->lock); - task_ctx = ctx; - } - - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. 
Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); - - add_event_to_ctx(event, ctx); - - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, task_ctx); - - return 0; -} - -/* - * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - */ -static void -perf_install_in_context(struct perf_event_context *ctx, - struct perf_event *event, - int cpu) -{ - struct task_struct *task = ctx->task; - - lockdep_assert_held(&ctx->mutex); - - event->ctx = ctx; - - if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. - */ - cpu_function_call(cpu, __perf_install_in_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_install_in_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); -} - -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. 
- */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - } -} - -/* - * Cross CPU call to enable a performance event - */ -static int __perf_event_enable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - if (WARN_ON_ONCE(!ctx->is_active)) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); - - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); - - __perf_event_mark_enabled(event); - - if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); - goto unlock; - } - - /* - * If the event is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } - -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Enable a event. 
- * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_event_for_each_child or perf_event_for_each as described - * for perf_event_disable. - */ -void perf_event_enable(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); - return; - } - - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - - /* - * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (event->state == PERF_EVENT_STATE_ERROR) - event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event); - goto out; - } - - raw_spin_unlock_irq(&ctx->lock); - - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. 
- */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); -} -EXPORT_SYMBOL_GPL(perf_event_enable); - -int perf_event_refresh(struct perf_event *event, int refresh) -{ - /* - * not supported on inherited events - */ - if (event->attr.inherit || !is_sampling_event(event)) - return -EINVAL; - - atomic_add(refresh, &event->event_limit); - perf_event_enable(event); - - return 0; -} -EXPORT_SYMBOL_GPL(perf_event_refresh); - -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - struct perf_event *event; - int is_active = ctx->is_active; - - ctx->is_active &= ~event_type; - if (likely(!ctx->nr_events)) - return; - - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) - return; - - perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) - group_sched_out(event, cpuctx, ctx); - } - - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) - group_sched_out(event, cpuctx, ctx); - } - perf_pmu_enable(ctx->pmu); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. 
- */ -static int context_equiv(struct perf_event_context *ctx1, - struct perf_event_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_event_sync_stat(struct perf_event *event, - struct perf_event *next_event) -{ - u64 value; - - if (!event->attr.inherit_stat) - return; - - /* - * Update the event value, we cannot use perf_event_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the event must be on the current CPU, therefore we - * don't need to use it. - */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: - event->pmu->read(event); - /* fall-through */ - - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the event - * values when we flip the contexts. - */ - value = local64_read(&next_event->count); - value = local64_xchg(&event->count, value); - local64_set(&next_event->count, value); - - swap(event->total_time_enabled, next_event->total_time_enabled); - swap(event->total_time_running, next_event->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. 
- */ - perf_event_update_userpage(event); - perf_event_update_userpage(next_event); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_event_sync_stat(struct perf_event_context *ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event *event, *next_event; - - if (!ctx->nr_stat) - return; - - update_context_time(ctx); - - event = list_first_entry(&ctx->event_list, - struct perf_event, event_entry); - - next_event = list_first_entry(&next_ctx->event_list, - struct perf_event, event_entry); - - while (&event->event_entry != &ctx->event_list && - &next_event->event_entry != &next_ctx->event_list) { - - __perf_event_sync_stat(event, next_event); - - event = list_next_entry(event, event_entry); - next_event = list_next_entry(next_event, event_entry); - } -} - -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) -{ - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; - struct perf_event_context *next_ctx; - struct perf_event_context *parent; - struct perf_cpu_context *cpuctx; - int do_switch = 1; - - if (likely(!ctx)) - return; - - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. 
- */ - raw_spin_lock(&ctx->lock); - raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_event_sync_stat(ctx, next_ctx); - } - raw_spin_unlock(&next_ctx->lock); - raw_spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; - raw_spin_unlock(&ctx->lock); - } -} - -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void __perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) -{ - int ctxn; - - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); - - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch out PMU state. 
- * cgroup event are system-wide mode only - */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task, next); -} - -static void task_ctx_sched_out(struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); -} - -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_event *event; - - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. 
- */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } - } -} - -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_event *event; - int can_add_hw = 1; - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } -} - -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) -{ - u64 now; - int is_active = ctx->is_active; - - ctx->is_active |= event_type; - if (likely(!ctx->nr_events)) - return; - - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. 
- */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) - ctx_pinned_sched_in(ctx, cpuctx); - - /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) - ctx_flexible_sched_in(ctx, cpuctx); -} - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) -{ - struct perf_event_context *ctx = &cpuctx->ctx; - - ctx_sched_in(ctx, cpuctx, event_type, task); -} - -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - perf_ctx_lock(cpuctx, ctx); - perf_pmu_disable(ctx->pmu); - /* - * We want to keep the following priority order: - * cpu pinned (that don't need to move), task pinned, - * cpu flexible, task flexible. - */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - - perf_pmu_enable(ctx->pmu); - perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); -} - -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. 
- */ -void __perf_event_task_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); -} - -static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) -{ - u64 frequency = event->attr.sample_freq; - u64 sec = NSEC_PER_SEC; - u64 divisor, dividend; - - int count_fls, nsec_fls, frequency_fls, sec_fls; - - count_fls = fls64(count); - nsec_fls = fls64(nsec); - frequency_fls = fls64(frequency); - sec_fls = 30; - - /* - * We got @count in @nsec, with a target of sample_freq HZ - * the target period becomes: - * - * @count * 10^9 - * period = ------------------- - * @nsec * sample_freq - * - */ - - /* - * Reduce accuracy by one bit such that @a and @b converge - * to a similar magnitude. - */ -#define REDUCE_FLS(a, b) \ -do { \ - if (a##_fls > b##_fls) { \ - a >>= 1; \ - a##_fls--; \ - } else { \ - b >>= 1; \ - b##_fls--; \ - } \ -} while (0) - - /* - * Reduce accuracy until either term fits in a u64, then proceed with - * the other, so that finally we can do a u64/u64 division. 
- */ - while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { - REDUCE_FLS(nsec, frequency); - REDUCE_FLS(sec, count); - } - - if (count_fls + sec_fls > 64) { - divisor = nsec * frequency; - - while (count_fls + sec_fls > 64) { - REDUCE_FLS(count, sec); - divisor >>= 1; - } - - dividend = count * sec; - } else { - dividend = count * sec; - - while (nsec_fls + frequency_fls > 64) { - REDUCE_FLS(nsec, frequency); - dividend >>= 1; - } - - divisor = nsec * frequency; - } - - if (!divisor) - return dividend; - - return div64_u64(dividend, divisor); -} - -static DEFINE_PER_CPU(int, perf_throttled_count); -static DEFINE_PER_CPU(u64, perf_throttled_seq); - -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period, sample_period; - s64 delta; - - period = perf_calculate_period(event, nsec, count); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; - - if (local64_read(&hwc->period_left) > 8*sample_period) { - if (disable) - event->pmu->stop(event, PERF_EF_UPDATE); - - local64_set(&hwc->period_left, 0); - - if (disable) - event->pmu->start(event, PERF_EF_RELOAD); - } -} - -/* - * combine freq adjustment with unthrottling to avoid two passes over the - * events. At the same time, make sure, having freq events does not change - * the rate of unthrottling as that would introduce bias. 
- */ -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, - int needs_unthr) -{ - struct perf_event *event; - struct hw_perf_event *hwc; - u64 now, period = TICK_NSEC; - s64 delta; - - /* - * only need to iterate over all events iff: - * - context have events in frequency mode (needs freq adjust) - * - there are events to unthrottle on this cpu - */ - if (!(ctx->nr_freq || needs_unthr)) - return; - - raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state != PERF_EVENT_STATE_ACTIVE) - continue; - - if (!event_filter_match(event)) - continue; - - hwc = &event->hw; - - if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { - hwc->interrupts = 0; - perf_log_throttle(event, 1); - event->pmu->start(event, 0); - } - - if (!event->attr.freq || !event->attr.sample_freq) - continue; - - /* - * stop the event and update event->count - */ - event->pmu->stop(event, PERF_EF_UPDATE); - - now = local64_read(&event->count); - delta = now - hwc->freq_count_stamp; - hwc->freq_count_stamp = now; - - /* - * restart the event - * reload only if value has changed - * we have stopped the event so tell that - * to perf_adjust_period() to avoid stopping it - * twice. - */ - if (delta > 0) - perf_adjust_period(event, period, delta, false); - - event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); - } - - perf_pmu_enable(ctx->pmu); - raw_spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's events: - */ -static void rotate_ctx(struct perf_event_context *ctx) -{ - /* - * Rotate the first entry last of non-pinned groups. Rotation might be - * disabled by the inheritance code. - */ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); -} - -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. 
- */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) -{ - struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; - - if (cpuctx->ctx.nr_events) { - remove = 0; - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; - } - - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - remove = 0; - if (ctx->nr_events != ctx->nr_active) - rotate = 1; - } - - if (!rotate) - goto done; - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_event_sched_in(cpuctx, ctx, current); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -done: - if (remove) - list_del_init(&cpuctx->rotation_list); -} - -void perf_event_task_tick(void) -{ - struct list_head *head = &__get_cpu_var(rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; - int throttled; - - WARN_ON(!irqs_disabled()); - - __this_cpu_inc(perf_throttled_seq); - throttled = __this_cpu_xchg(perf_throttled_count, 0); - - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; - perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); - } -} - -static int event_enable_on_exec(struct perf_event *event, - struct perf_event_context *ctx) -{ - if (!event->attr.enable_on_exec) - return 0; - - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - return 0; - - __perf_event_mark_enabled(event); - - return 1; -} - -/* - * Enable all of a task's events that have been marked enable-on-exec. - * This expects task == current. 
- */ -static void perf_event_enable_on_exec(struct perf_event_context *ctx) -{ - struct perf_event *event; - unsigned long flags; - int enabled = 0; - int ret; - - local_irq_save(flags); - if (!ctx || !ctx->nr_events) - goto out; - - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - /* - * Unclone this context if we enabled any event. - */ - if (enabled) - unclone_ctx(ctx); - - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); -out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware event - */ -static void __perf_event_read(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * event->count would have been updated to a recent sample - * when the event was scheduled out. 
- */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - raw_spin_lock(&ctx->lock); - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); - raw_spin_unlock(&ctx->lock); -} - -static inline u64 perf_event_count(struct perf_event *event) -{ - return local64_read(&event->count) + atomic64_read(&event->child_count); -} - -static u64 perf_event_read(struct perf_event *event) -{ - /* - * If event is enabled and currently active on a CPU, update the - * value in the event structure: - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { - struct perf_event_context *ctx = event->ctx; - unsigned long flags; - - raw_spin_lock_irqsave(&ctx->lock, flags); - /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time - */ - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - update_event_times(event); - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - return perf_event_count(event); -} - -/* - * Initialize the perf_event context in a task_struct: - */ -static void __perf_event_init_context(struct perf_event_context *ctx) -{ - raw_spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); -} - -static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) -{ - struct perf_event_context *ctx; - - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!ctx) - return NULL; - - __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } - ctx->pmu = pmu; - - return ctx; -} - 
-static struct task_struct * -find_lively_task_by_vpid(pid_t vpid) -{ - struct task_struct *task; - int err; - - rcu_read_lock(); - if (!vpid) - task = current; - else - task = find_task_by_vpid(vpid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - -} - -/* - * Returns a matching context with refcount and pincount. - */ -static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) -{ - struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - unsigned long flags; - int ctxn, err; - - if (!task) { - /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - ++ctx->pin_count; - - return ctx; - } - - err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - -retry: - ctx = perf_lock_task_context(task, ctxn, &flags); - if (ctx) { - unclone_ctx(ctx); - ++ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } else { - ctx = alloc_perf_context(pmu, task); - err = -ENOMEM; - if (!ctx) - goto errout; - - err = 0; - mutex_lock(&task->perf_event_mutex); - /* - * If it has already passed perf_event_exit_task(). - * we must see PF_EXITING, it takes this mutex too. 
- */ - if (task->flags & PF_EXITING) - err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) - err = -EAGAIN; - else { - get_ctx(ctx); - ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); - } - mutex_unlock(&task->perf_event_mutex); - - if (unlikely(err)) { - put_ctx(ctx); - - if (err == -EAGAIN) - goto retry; - goto errout; - } - } - - return ctx; - -errout: - return ERR_PTR(err); -} - -static void perf_event_free_filter(struct perf_event *event); - -static void free_event_rcu(struct rcu_head *head) -{ - struct perf_event *event; - - event = container_of(head, struct perf_event, rcu_head); - if (event->ns) - put_pid_ns(event->ns); - perf_event_free_filter(event); - kfree(event); -} - -static void ring_buffer_put(struct ring_buffer *rb); - -static void free_event(struct perf_event *event) -{ - irq_work_sync(&event->pending); - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_dec(&nr_mmap_events); - if (event->attr.comm) - atomic_dec(&nr_comm_events); - if (event->attr.task) - atomic_dec(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - if (is_cgroup_event(event)) { - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); - } - } - - if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; - } - - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - call_rcu(&event->rcu_head, free_event_rcu); -} - -int perf_event_release_kernel(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. 
- * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); - mutex_unlock(&ctx->mutex); - - free_event(event); - - return 0; -} -EXPORT_SYMBOL_GPL(perf_event_release_kernel); - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_event *event = file->private_data; - struct task_struct *owner; - - file->private_data = NULL; - - rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); - /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on - * owner->perf_event_mutex. - */ - smp_read_barrier_depends(); - if (owner) { - /* - * Since delayed_put_task_struct() also drops the last - * task reference we can safely take a new reference - * while holding the rcu_read_lock(). - */ - get_task_struct(owner); - } - rcu_read_unlock(); - - if (owner) { - mutex_lock(&owner->perf_event_mutex); - /* - * We have to re-check the event->owner field, if it is cleared - * we raced with perf_event_exit_task(), acquiring the mutex - * ensured they're done, and we can proceed with freeing the - * event. 
- */ - if (event->owner) - list_del_init(&event->owner_entry); - mutex_unlock(&owner->perf_event_mutex); - put_task_struct(owner); - } - - return perf_event_release_kernel(event); -} - -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) -{ - struct perf_event *child; - u64 total = 0; - - *enabled = 0; - *running = 0; - - mutex_lock(&event->child_mutex); - total += perf_event_read(event); - *enabled += event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - *running += event->total_time_running + - atomic64_read(&event->child_total_time_running); - - list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); - *enabled += child->total_time_enabled; - *running += child->total_time_running; - } - mutex_unlock(&event->child_mutex); - - return total; -} -EXPORT_SYMBOL_GPL(perf_event_read_value); - -static int perf_event_read_group(struct perf_event *event, - u64 read_format, char __user *buf) -{ - struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; - struct perf_event_context *ctx = leader->ctx; - u64 values[5]; - u64 count, enabled, running; - - mutex_lock(&ctx->mutex); - count = perf_event_read_value(leader, &enabled, &running); - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(leader); - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - goto unlock; - - ret = size; - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); - - size = n * sizeof(u64); - - if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; 
- } - - ret += size; - } -unlock: - mutex_unlock(&ctx->mutex); - - return ret; -} - -static int perf_event_read_one(struct perf_event *event, - u64 read_format, char __user *buf) -{ - u64 enabled, running; - u64 values[4]; - int n = 0; - - values[n++] = perf_event_read_value(event, &enabled, &running); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance event - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) -{ - u64 read_format = event->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a event that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (event->state == PERF_EVENT_STATE_ERROR) - return 0; - - if (count < event->read_size) - return -ENOSPC; - - WARN_ON_ONCE(event->ctx->parent_ctx); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, buf); - else - ret = perf_event_read_one(event, read_format, buf); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_event *event = file->private_data; - - return perf_read_hw(event, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_event *event = file->private_data; - struct ring_buffer *rb; - unsigned int events = POLL_HUP; - - /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. 
- * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() - */ - mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); - events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - - mutex_unlock(&event->mmap_mutex); - - poll_wait(file, &event->waitq, wait); - - return events; -} - -static void perf_event_reset(struct perf_event *event) -{ - (void)perf_event_read(event); - local64_set(&event->count, 0); - perf_event_update_userpage(event); -} - -/* - * Holding the top-level event's child_mutex means that any - * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the - * task existence requirements of perf_event_enable/disable. 
- */ -static void perf_event_for_each_child(struct perf_event *event, - void (*func)(struct perf_event *)) -{ - struct perf_event *child; - - WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); - func(event); - list_for_each_entry(child, &event->child_list, child_list) - func(child); - mutex_unlock(&event->child_mutex); -} - -static void perf_event_for_each(struct perf_event *event, - void (*func)(struct perf_event *)) -{ - struct perf_event_context *ctx = event->ctx; - struct perf_event *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - event = event->group_leader; - - perf_event_for_each_child(event, func); - func(event); - list_for_each_entry(sibling, &event->sibling_list, group_entry) - perf_event_for_each_child(event, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0; - u64 value; - - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; - - raw_spin_lock_irq(&ctx->lock); - if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } -unlock: - raw_spin_unlock_irq(&ctx->lock); - - return ret; -} - -static const struct file_operations perf_fops; - -static struct perf_event *perf_fget_light(int fd, int *fput_needed) -{ - struct file *file; - - file = fget_light(fd, fput_needed); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &perf_fops) { - fput_light(file, *fput_needed); - *fput_needed = 0; - return ERR_PTR(-EBADF); - } - - return file->private_data; -} - -static int perf_event_set_output(struct perf_event *event, - struct perf_event *output_event); -static int perf_event_set_filter(struct perf_event *event, void 
__user *arg); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_event *event = file->private_data; - void (*func)(struct perf_event *); - u32 flags = arg; - - switch (cmd) { - case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; - break; - case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; - break; - case PERF_EVENT_IOC_RESET: - func = perf_event_reset; - break; - - case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); - - case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); - - case PERF_EVENT_IOC_SET_OUTPUT: - { - struct perf_event *output_event = NULL; - int fput_needed = 0; - int ret; - - if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); - } - - ret = perf_event_set_output(event, output_event); - if (output_event) - fput_light(output_event->filp, fput_needed); - - return ret; - } - - case PERF_EVENT_IOC_SET_FILTER: - return perf_event_set_filter(event, (void __user *)arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_event_for_each(event, func); - else - perf_event_for_each_child(event, func); - - return 0; -} - -int perf_event_task_enable(void) -{ - struct perf_event *event; - - mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); - mutex_unlock(¤t->perf_event_mutex); - - return 0; -} - -int perf_event_task_disable(void) -{ - struct perf_event *event; - - mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); - mutex_unlock(¤t->perf_event_mutex); - - return 0; -} - -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - -static int perf_event_index(struct perf_event *event) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; 
- - if (event->state != PERF_EVENT_STATE_ACTIVE) - return 0; - - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; -} - -static void calc_timer_values(struct perf_event *event, - u64 *enabled, - u64 *running) -{ - u64 now, ctx_time; - - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_event_update_userpage(struct perf_event *event) -{ - struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; - u64 enabled, running; - - rcu_read_lock(); - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. - * - * we cannot simply called update_context_time() - * because of locking issue as we can be called in - * NMI context - */ - calc_timer_values(event, &enabled, &running); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - userpg = rb->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. 
- */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= local64_read(&event->hw.prev_count); - - userpg->time_enabled = enabled + - atomic64_read(&event->child_total_time_enabled); - - userpg->time_running = running + - atomic64_read(&event->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb) -{ - unsigned long flags; - - if (!list_empty(&event->rb_entry)) - return; - - spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: - spin_unlock_irqrestore(&rb->event_lock, flags); -} - -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) -{ - unsigned long flags; - - if (list_empty(&event->rb_entry)) - return; - - spin_lock_irqsave(&rb->event_lock, flags); - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - spin_unlock_irqrestore(&rb->event_lock, flags); -} - -static void ring_buffer_wakeup(struct perf_event *event) -{ - struct ring_buffer *rb; - - 
rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) - wake_up_all(&event->waitq); - -unlock: - rcu_read_unlock(); -} - -static void rb_free_rcu(struct rcu_head *rcu_head) -{ - struct ring_buffer *rb; - - rb = container_of(rcu_head, struct ring_buffer, rcu_head); - rb_free(rb); -} - -static struct ring_buffer *ring_buffer_get(struct perf_event *event) -{ - struct ring_buffer *rb; - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - if (!atomic_inc_not_zero(&rb->refcount)) - rb = NULL; - } - rcu_read_unlock(); - - return rb; -} - -static void ring_buffer_put(struct ring_buffer *rb) -{ - struct perf_event *event, *n; - unsigned long flags; - - if (!atomic_dec_and_test(&rb->refcount)) - return; - - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); - - call_rcu(&rb->rcu_head, rb_free_rcu); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_event *event = vma->vm_file->private_data; - - atomic_inc(&event->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_event *event = vma->vm_file->private_data; - - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; - struct ring_buffer *rb = event->rb; - - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - mutex_unlock(&event->mmap_mutex); - - ring_buffer_put(rb); - free_uid(user); - } -} - -static const struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - 
.page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - struct ring_buffer *rb; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0, flags = 0; - - /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same rb. - */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; - - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(event->rb); - - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; - - rb = 
rb_alloc(nr_pages, - event->attr.watermark ? event->attr.wakeup_watermark : 0, - event->cpu, flags); - - if (!rb) { - ret = -ENOMEM; - goto unlock; - } - rcu_assign_pointer(event->rb, rb); - - atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; - -unlock: - if (!ret) - atomic_inc(&event->mmap_count); - mutex_unlock(&event->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_event *event = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .llseek = no_llseek, - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf event wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_event_wakeup(struct perf_event *event) -{ - ring_buffer_wakeup(event); - - if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); - event->pending_kill = 0; - } -} - -static void perf_pending_event(struct irq_work *entry) -{ - struct perf_event *event = container_of(entry, - struct perf_event, pending); - - if (event->pending_disable) { - event->pending_disable = 0; - __perf_event_disable(event); - } - - if (event->pending_wakeup) { - event->pending_wakeup = 0; - perf_event_wakeup(event); - } -} - -/* - * We assume there is only KVM supporting the callbacks. 
- * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ -struct perf_guest_info_callbacks *perf_guest_cbs; - -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) -{ - perf_guest_cbs = cbs; - return 0; -} -EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); - -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) -{ - perf_guest_cbs = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); - -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; - header->size += event->id_header_size; - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_event_pid(event, current); - data->tid_entry.tid = perf_event_tid(event, current); - } - - if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); - - if (sample_type & PERF_SAMPLE_ID) - data->id = primary_event_id(event); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - data->stream_id = event->id; - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - } -} - -void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); -} - -static void __perf_event__output_id_sample(struct perf_output_handle *handle, - struct perf_sample_data *data) -{ - u64 sample_type = data->type; - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - 
perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); -} - -void perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample) -{ - if (event->attr.sample_id_all) - __perf_event__output_id_sample(handle, sample); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) -{ - u64 read_format = event->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = perf_event_count(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = running + - atomic64_read(&event->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - __output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 
- */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) -{ - struct perf_event *leader = event->group_leader, *sub; - u64 read_format = event->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - - if (leader != event) - leader->pmu->read(leader); - - values[n++] = perf_event_count(leader); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(leader); - - __output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - if (sub != event) - sub->pmu->read(sub); - - values[n++] = perf_event_count(sub); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); - - __output_copy(handle, values, n * sizeof(u64)); - } -} - -#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ - PERF_FORMAT_TOTAL_TIME_RUNNING) - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_event *event) -{ - u64 enabled = 0, running = 0; - u64 read_format = event->attr.read_format; - - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. 
- * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context - */ - if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); - - if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event, enabled, running); - else - perf_output_read_one(handle, event, enabled, running); -} - -void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = data->type; - - perf_output_put(handle, *header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(handle, data->ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, event); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); - - __output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(handle, data->raw->size); - __output_copy(handle, data->raw->data, - data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(handle, raw); - } - } - - if (!event->attr.watermark) { - int wakeup_events = 
event->attr.wakeup_events; - - if (wakeup_events) { - struct ring_buffer *rb = handle->rb; - int events = local_inc_return(&rb->events); - - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); - } - } - } -} - -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event, - struct pt_regs *regs) -{ - u64 sample_type = event->attr.sample_type; - - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - __perf_event_header__init_id(header, data, event); - - if (sample_type & PERF_SAMPLE_IP) - data->ip = perf_instruction_pointer(regs); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - data->callchain = perf_callchain(regs); - - if (data->callchain) - size += data->callchain->nr; - - header->size += size * sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; - } -} - -static void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_output_handle handle; - struct perf_event_header header; - - /* protect the callchain buffers */ - rcu_read_lock(); - - perf_prepare_sample(&header, data, event, regs); - - if (perf_output_begin(&handle, event, header.size)) - goto exit; - - perf_output_sample(&handle, &header, data, event); - - perf_output_end(&handle); - -exit: - rcu_read_unlock(); -} - -/* - * read event_id - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_event_read_event(struct perf_event *event, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - struct perf_read_event read_event = { - .header = { - 
.type = PERF_RECORD_READ, - .misc = 0, - .size = sizeof(read_event) + event->read_size, - }, - .pid = perf_event_pid(event, task), - .tid = perf_event_tid(event, task), - }; - int ret; - - perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); - if (ret) - return; - - perf_output_put(&handle, read_event); - perf_output_read(&handle, event); - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_event_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - u64 time; - } event_id; -}; - -static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - struct task_struct *task = task_event->task; - int ret, size = task_event->event_id.header.size; - - perf_event_header__init_id(&task_event->event_id.header, &sample, event); - - ret = perf_output_begin(&handle, event, - task_event->event_id.header.size); - if (ret) - goto out; - - task_event->event_id.pid = perf_event_pid(event, task); - task_event->event_id.ppid = perf_event_pid(event, current); - - task_event->event_id.tid = perf_event_tid(event, task); - task_event->event_id.ptid = perf_event_tid(event, current); - - perf_output_put(&handle, task_event->event_id); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - task_event->event_id.header.size = size; -} - -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || 
event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - } - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); -} - -static void perf_event_task(struct task_struct *task, - struct perf_event_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_events) && - !atomic_read(&nr_mmap_events) && - !atomic_read(&nr_task_events)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event_id = { - .header = { - .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, - .misc = 0, - .size = sizeof(task_event.event_id), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - .time = perf_clock(), - }, - }; - - perf_event_task_event(&task_event); -} - -void perf_event_fork(struct task_struct *task) -{ - perf_event_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event_id; -}; - -static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int size = comm_event->event_id.header.size; - int ret; - - perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size); - - if (ret) - goto out; - - comm_event->event_id.pid = perf_event_pid(event, comm_event->task); - comm_event->event_id.tid = perf_event_tid(event, comm_event->task); - - perf_output_put(&handle, comm_event->event_id); - __output_copy(&handle, comm_event->comm, - comm_event->comm_size); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - comm_event->event_id.header.size = size; -} - -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } -} - -static void perf_event_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context 
*ctx; - char comm[TASK_COMM_LEN]; - unsigned int size; - struct pmu *pmu; - int ctxn; - - memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); -} - -void perf_event_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - - if (!atomic_read(&nr_comm_events)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event_id = { - .header = { - .type = PERF_RECORD_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_event_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event_id; -}; - -static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int size = mmap_event->event_id.header.size; - int ret; - - perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, 
- mmap_event->event_id.header.size); - if (ret) - goto out; - - mmap_event->event_id.pid = perf_event_pid(event, current); - mmap_event->event_id.tid = perf_event_tid(event, current); - - perf_output_put(&handle, mmap_event->event_id); - __output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - mmap_event->event_id.header.size = size; -} - -static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } -} - -static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - struct pmu *pmu; - int ctxn; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the rb backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. 
- */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = strncpy(tmp, "[heap]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = strncpy(tmp, "[stack]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); - - kfree(buf); -} - -void perf_event_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_events)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event_id = { - .header = { - .type = PERF_RECORD_MMAP, - .misc = PERF_RECORD_MISC_USER, - /* .size */ - }, - /* .pid */ - /* .tid */ - 
.start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, - }, - }; - - perf_event_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_event *event, int enable) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_RECORD_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = perf_clock(), - .id = primary_event_id(event), - .stream_id = event->id, - }; - - if (enable) - throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - - perf_event_header__init_id(&throttle_event.header, &sample, event); - - ret = perf_output_begin(&handle, event, - throttle_event.header.size); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_event__output_id_sample(event, &handle, &sample); - perf_output_end(&handle); -} - -/* - * Generic event overflow handling, sampling. - */ - -static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) -{ - int events = atomic_read(&event->event_limit); - struct hw_perf_event *hwc = &event->hw; - u64 seq; - int ret = 0; - - /* - * Non-sampling counters might still use the PMI to fold short - * hardware counters, ignore those. 
- */ - if (unlikely(!is_sampling_event(event))) - return 0; - - seq = __this_cpu_read(perf_throttled_seq); - if (seq != hwc->interrupts_seq) { - hwc->interrupts_seq = seq; - hwc->interrupts = 1; - } else { - hwc->interrupts++; - if (unlikely(throttle - && hwc->interrupts >= max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } - - if (event->attr.freq) { - u64 now = perf_clock(); - s64 delta = now - hwc->freq_time_stamp; - - hwc->freq_time_stamp = now; - - if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period, true); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * events - */ - - event->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&event->event_limit)) { - ret = 1; - event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); - } - - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); - - if (event->fasync && event->pending_kill) { - event->pending_wakeup = 1; - irq_work_queue(&event->pending); - } - - return ret; -} - -int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - return __perf_event_overflow(event, 1, data, regs); -} - -/* - * Generic software event infrastructure - */ - -struct swevent_htable { - struct swevent_hlist *swevent_hlist; - struct mutex hlist_mutex; - int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; -}; - -static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); - -/* - * We directly increment event->count and keep a second value in - * event->hw.period_left to count intervals. This period event - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. 
- */ - -static u64 perf_swevent_set_period(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_event *hwc = &event->hw; - int throttle = 0; - - if (!overflow) - overflow = perf_swevent_set_period(event); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_event_overflow(event, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } -} - -static void perf_swevent_event(struct perf_event *event, u64 nr, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_event *hwc = &event->hw; - - local64_add(nr, &event->count); - - if (!regs) - return; - - if (!is_sampling_event(event)) - return; - - if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { - data->period = nr; - return perf_swevent_overflow(event, 1, data, regs); - } else - data->period = event->hw.last_period; - - if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, data, regs); - - if (local64_add_negative(nr, &hwc->period_left)) - return; - - perf_swevent_overflow(event, 0, data, regs); -} - -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 1; - - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 1; - - if (event->attr.exclude_kernel && 
!user_mode(regs)) - return 1; - } - - return 0; -} - -static int perf_swevent_match(struct perf_event *event, - enum perf_type_id type, - u32 event_id, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->attr.type != type) - return 0; - - if (event->attr.config != event_id) - return 0; - - if (perf_exclude_event(event, regs)) - return 0; - - return 1; -} - -static inline u64 swevent_hash(u64 type, u32 event_id) -{ - u64 val = event_id | (type << 32); - - return hash_64(val, SWEVENT_HLIST_BITS); -} - -static inline struct hlist_head * -__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) -{ - u64 hash = swevent_hash(type, event_id); - - return &hlist->heads[hash]; -} - -/* For the read side: events when they trigger */ -static inline struct hlist_head * -find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) -{ - struct swevent_hlist *hlist; - - hlist = rcu_dereference(swhash->swevent_hlist); - if (!hlist) - return NULL; - - return __find_swevent_head(hlist, type, event_id); -} - -/* For the event head insertion and removal in the hlist */ -static inline struct hlist_head * -find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) -{ - struct swevent_hlist *hlist; - u32 event_id = event->attr.config; - u64 type = event->attr.type; - - /* - * Event scheduling is always serialized against hlist allocation - * and release. Which makes the protected version suitable here. - * The context lock guarantees that. 
- */ - hlist = rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&event->ctx->lock)); - if (!hlist) - return NULL; - - return __find_swevent_head(hlist, type, event_id); -} - -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - struct perf_event *event; - struct hlist_node *node; - struct hlist_head *head; - - rcu_read_lock(); - head = find_swevent_head_rcu(swhash, type, event_id); - if (!head) - goto end; - - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, data, regs); - } -end: - rcu_read_unlock(); -} - -int perf_swevent_get_recursion_context(void) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - - return get_recursion_context(swhash->recursion); -} -EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); - -inline void perf_swevent_put_recursion_context(int rctx) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - - put_recursion_context(swhash->recursion, rctx); -} - -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data; - int rctx; - - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - return; - - perf_sample_data_init(&data, addr); - - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); - - perf_swevent_put_recursion_context(rctx); - preempt_enable_notrace(); -} - -static void perf_swevent_read(struct perf_event *event) -{ -} - -static int perf_swevent_add(struct perf_event *event, int flags) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - struct hw_perf_event *hwc = &event->hw; - struct hlist_head *head; - - if (is_sampling_event(event)) { - hwc->last_period = hwc->sample_period; - perf_swevent_set_period(event); - } - 
- hwc->state = !(flags & PERF_EF_START); - - head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) - return -EINVAL; - - hlist_add_head_rcu(&event->hlist_entry, head); - - return 0; -} - -static void perf_swevent_del(struct perf_event *event, int flags) -{ - hlist_del_rcu(&event->hlist_entry); -} - -static void perf_swevent_start(struct perf_event *event, int flags) -{ - event->hw.state = 0; -} - -static void perf_swevent_stop(struct perf_event *event, int flags) -{ - event->hw.state = PERF_HES_STOPPED; -} - -/* Deref the hlist from the update side */ -static inline struct swevent_hlist * -swevent_hlist_deref(struct swevent_htable *swhash) -{ - return rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&swhash->hlist_mutex)); -} - -static void swevent_hlist_release(struct swevent_htable *swhash) -{ - struct swevent_hlist *hlist = swevent_hlist_deref(swhash); - - if (!hlist) - return; - - rcu_assign_pointer(swhash->swevent_hlist, NULL); - kfree_rcu(hlist, rcu_head); -} - -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - - if (!--swhash->hlist_refcount) - swevent_hlist_release(swhash); - - mutex_unlock(&swhash->hlist_mutex); -} - -static void swevent_hlist_put(struct perf_event *event) -{ - int cpu; - - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - - for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); -} - -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - int err = 0; - - mutex_lock(&swhash->hlist_mutex); - - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { - struct swevent_hlist *hlist; - - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - if (!hlist) { - err = -ENOMEM; - goto exit; - } - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - swhash->hlist_refcount++; 
-exit: - mutex_unlock(&swhash->hlist_mutex); - - return err; -} - -static int swevent_hlist_get(struct perf_event *event) -{ - int err; - int cpu, failed_cpu; - - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - - get_online_cpus(); - for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); - if (err) { - failed_cpu = cpu; - goto fail; - } - } - put_online_cpus(); - - return 0; -fail: - for_each_possible_cpu(cpu) { - if (cpu == failed_cpu) - break; - swevent_hlist_put_cpu(event, cpu); - } - - put_online_cpus(); - return err; -} - -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_event_destroy(struct perf_event *event) -{ - u64 event_id = event->attr.config; - - WARN_ON(event->parent); - - jump_label_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); -} - -static int perf_swevent_init(struct perf_event *event) -{ - int event_id = event->attr.config; - - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - case PERF_COUNT_SW_TASK_CLOCK: - return -ENOENT; - - default: - break; - } - - if (event_id >= PERF_COUNT_SW_MAX) - return -ENOENT; - - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return err; - - jump_label_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; - } - - return 0; -} - -static struct pmu perf_swevent = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_swevent_init, - .add = perf_swevent_add, - .del = perf_swevent_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -#ifdef CONFIG_EVENT_TRACING - -static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - -static int perf_tp_event_match(struct perf_event 
*event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; - /* - * All tracepoints are from kernel-space. - */ - if (event->attr.exclude_kernel) - return 0; - - if (!perf_tp_filter_match(event, data)) - return 0; - - return 1; -} - -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) -{ - struct perf_sample_data data; - struct perf_event *event; - struct hlist_node *node; - - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } - - perf_swevent_put_recursion_context(rctx); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -static inline void perf_tp_register(void) -{ - perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, 
filter_str); - - kfree(filter_str); - return ret; -} - -static void perf_event_free_filter(struct perf_event *event) -{ - ftrace_profile_free_filter(event); -} - -#else - -static inline void perf_tp_register(void) -{ -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - -static void perf_event_free_filter(struct perf_event *event) -{ -} - -#endif /* CONFIG_EVENT_TRACING */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -void perf_bp_event(struct perf_event *bp, void *data) -{ - struct perf_sample_data sample; - struct pt_regs *regs = data; - - perf_sample_data_init(&sample, bp->attr.bp_addr); - - if (!bp->hw.state && !perf_exclude_event(bp, regs)) - perf_swevent_event(bp, 1, &sample, regs); -} -#endif - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return HRTIMER_NORESTART; - - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && is_idle_task(current))) - if (perf_event_overflow(event, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period; - - if (!is_sampling_event(event)) - return; - - period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; - - local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - 
__hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (is_sampling_event(event)) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - local64_set(&hwc->period_left, ktime_to_ns(remaining)); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -static void perf_swevent_init_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (!is_sampling_event(event)) - return; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - - /* - * Since hrtimers have a fixed rate, we can do a static freq->period - * mapping and avoid the whole period adjust feedback stuff. - */ - if (event->attr.freq) { - long freq = event->attr.sample_freq; - - event->attr.sample_period = NSEC_PER_SEC / freq; - hwc->sample_period = event->attr.sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - event->attr.freq = 0; - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_event_update(struct perf_event *event) -{ - s64 prev; - u64 now; - - now = local_clock(); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static void cpu_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, local_clock()); - perf_swevent_start_hrtimer(event); -} - -static void cpu_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); -} - -static int cpu_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - cpu_clock_event_start(event, flags); - - return 0; -} - -static void cpu_clock_event_del(struct perf_event *event, int flags) -{ - cpu_clock_event_stop(event, flags); -} - -static void cpu_clock_event_read(struct perf_event *event) 
-{ - cpu_clock_event_update(event); -} - -static int cpu_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) - return -ENOENT; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_cpu_clock = { - .task_ctx_nr = perf_sw_context, - - .event_init = cpu_clock_event_init, - .add = cpu_clock_event_add, - .del = cpu_clock_event_del, - .start = cpu_clock_event_start, - .stop = cpu_clock_event_stop, - .read = cpu_clock_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static void task_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, event->ctx->time); - perf_swevent_start_hrtimer(event); -} - -static void task_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); -} - -static int task_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - task_clock_event_start(event, flags); - - return 0; -} - -static void task_clock_event_del(struct perf_event *event, int flags) -{ - task_clock_event_stop(event, PERF_EF_UPDATE); -} - -static void task_clock_event_read(struct perf_event *event) -{ - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; - - task_clock_event_update(event, time); -} - -static int task_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) - return -ENOENT; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_task_clock = { - .task_ctx_nr 
= perf_sw_context, - - .event_init = task_clock_event_init, - .add = task_clock_event_add, - .del = task_clock_event_del, - .start = task_clock_event_start, - .stop = task_clock_event_stop, - .read = task_clock_event_read, -}; - -static void perf_pmu_nop_void(struct pmu *pmu) -{ -} - -static int perf_pmu_nop_int(struct pmu *pmu) -{ - return 0; -} - -static void perf_pmu_start_txn(struct pmu *pmu) -{ - perf_pmu_disable(pmu); -} - -static int perf_pmu_commit_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); - return 0; -} - -static void perf_pmu_cancel_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); -} - -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. - */ -static void *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; - } -} - -static void free_pmu_context(struct pmu *pmu) -{ - struct pmu *i; - - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. 
- */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - - free_percpu(pmu->pmu_cpu_context); -out: - mutex_unlock(&pmus_lock); -} -static struct idr pmu_idr; - -static ssize_t -type_show(struct device *dev, struct device_attribute *attr, char *page) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); -} - -static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, -}; - -static int pmu_bus_running; -static struct bus_type pmu_bus = { - .name = "event_source", - .dev_attrs = pmu_dev_attrs, -}; - -static void pmu_dev_release(struct device *dev) -{ - kfree(dev); -} - -static int pmu_dev_alloc(struct pmu *pmu) -{ - int ret = -ENOMEM; - - pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!pmu->dev) - goto out; - - device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; - - dev_set_drvdata(pmu->dev, pmu); - pmu->dev->bus = &pmu_bus; - pmu->dev->release = pmu_dev_release; - ret = device_add(pmu->dev); - if (ret) - goto free_dev; - -out: - return ret; - -free_dev: - put_device(pmu->dev); - goto out; -} - -static struct lock_class_key cpuctx_mutex; -static struct lock_class_key cpuctx_lock; - -int perf_pmu_register(struct pmu *pmu, char *name, int type) -{ - int cpu, ret; - - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; - - pmu->type = -1; - if (!name) - goto skip_type; - pmu->name = name; - - if (type < 0) { - int err = idr_pre_get(&pmu_idr, GFP_KERNEL); - if (!err) - goto free_pdc; - - err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); - if (err) { - ret = err; - goto free_pdc; - } - } - pmu->type = type; - - if (pmu_bus_running) { - ret = pmu_dev_alloc(pmu); - if (ret) - goto free_idr; - } - -skip_type: - pmu->pmu_cpu_context = 
find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) - goto free_dev; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; - cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; - INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; - } - -got_cpu_context: - if (!pmu->start_txn) { - if (pmu->pmu_enable) { - /* - * If we have pmu_enable/pmu_disable calls, install - * transaction stubs that use that to try and batch - * hardware accesses. - */ - pmu->start_txn = perf_pmu_start_txn; - pmu->commit_txn = perf_pmu_commit_txn; - pmu->cancel_txn = perf_pmu_cancel_txn; - } else { - pmu->start_txn = perf_pmu_nop_void; - pmu->commit_txn = perf_pmu_nop_int; - pmu->cancel_txn = perf_pmu_nop_void; - } - } - - if (!pmu->pmu_enable) { - pmu->pmu_enable = perf_pmu_nop_void; - pmu->pmu_disable = perf_pmu_nop_void; - } - - list_add_rcu(&pmu->entry, &pmus); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free_dev: - device_del(pmu->dev); - put_device(pmu->dev); - -free_idr: - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; -} - -void perf_pmu_unregister(struct pmu *pmu) -{ - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); - - /* - * We dereference the pmu list under both SRCU and regular RCU, so - * synchronize against both of those. 
- */ - synchronize_srcu(&pmus_srcu); - synchronize_rcu(); - - free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - device_del(pmu->dev); - put_device(pmu->dev); - free_pmu_context(pmu); -} - -struct pmu *perf_init_event(struct perf_event *event) -{ - struct pmu *pmu = NULL; - int idx; - int ret; - - idx = srcu_read_lock(&pmus_srcu); - - rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); - rcu_read_unlock(); - if (pmu) { - event->pmu = pmu; - ret = pmu->event_init(event); - if (ret) - pmu = ERR_PTR(ret); - goto unlock; - } - - list_for_each_entry_rcu(pmu, &pmus, entry) { - event->pmu = pmu; - ret = pmu->event_init(event); - if (!ret) - goto unlock; - - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } - } - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - - return pmu; -} - -/* - * Allocate and initialize a event structure - */ -static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, int cpu, - struct task_struct *task, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler, - void *context) -{ - struct pmu *pmu; - struct perf_event *event; - struct hw_perf_event *hwc; - long err; - - if ((unsigned)cpu >= nr_cpu_ids) { - if (!task || cpu != -1) - return ERR_PTR(-EINVAL); - } - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return ERR_PTR(-ENOMEM); - - /* - * Single events are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = event; - - mutex_init(&event->child_mutex); - INIT_LIST_HEAD(&event->child_list); - - INIT_LIST_HEAD(&event->group_entry); - INIT_LIST_HEAD(&event->event_entry); - INIT_LIST_HEAD(&event->sibling_list); - INIT_LIST_HEAD(&event->rb_entry); - - init_waitqueue_head(&event->waitq); - init_irq_work(&event->pending, perf_pending_event); - - mutex_init(&event->mmap_mutex); - - event->cpu = cpu; - event->attr = 
*attr; - event->group_leader = group_leader; - event->pmu = NULL; - event->oncpu = -1; - - event->parent = parent_event; - - event->ns = get_pid_ns(current->nsproxy->pid_ns); - event->id = atomic64_inc_return(&perf_event_id); - - event->state = PERF_EVENT_STATE_INACTIVE; - - if (task) { - event->attach_state = PERF_ATTACH_TASK; -#ifdef CONFIG_HAVE_HW_BREAKPOINT - /* - * hw_breakpoint is a bit difficult here.. - */ - if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - } - - if (!overflow_handler && parent_event) { - overflow_handler = parent_event->overflow_handler; - context = parent_event->overflow_handler_context; - } - - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; - - if (attr->disabled) - event->state = PERF_EVENT_STATE_OFF; - - pmu = NULL; - - hwc = &event->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - local64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited events - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - pmu = perf_init_event(event); - -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (event->ns) - put_pid_ns(event->ns); - kfree(event); - return ERR_PTR(err); - } - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); - if (event->attr.mmap || event->attr.mmap_data) - atomic_inc(&nr_mmap_events); - if (event->attr.comm) - atomic_inc(&nr_comm_events); - if (event->attr.task) - atomic_inc(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); - if (err) { - free_event(event); - return ERR_PTR(err); - } - } - } - - return event; -} - -static int perf_copy_attr(struct perf_event_attr __user *uattr, - 
struct perf_event_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - if (attr->__reserved_1) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -static int -perf_event_set_output(struct perf_event *event, struct perf_event *output_event) -{ - struct ring_buffer *rb = NULL, *old_rb = NULL; - int ret = -EINVAL; - - if (!output_event) - goto set; - - /* don't allow circular references */ - if (event == output_event) - goto out; - - /* - * Don't allow cross-cpu buffers - */ - if (output_event->cpu != event->cpu) - goto out; - - /* - * If its not a per-cpu rb, it must be the same task. 
- */ - if (output_event->cpu == -1 && output_event->ctx != event->ctx) - goto out; - -set: - mutex_lock(&event->mmap_mutex); - /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) - goto unlock; - - if (output_event) { - /* get the rb we want to redirect to */ - rb = ring_buffer_get(output_event); - if (!rb) - goto unlock; - } - - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); - if (old_rb) - ring_buffer_detach(event, old_rb); - ret = 0; -unlock: - mutex_unlock(&event->mmap_mutex); - - if (old_rb) - ring_buffer_put(old_rb); -out: - return ret; -} - -/** - * sys_perf_event_open - open a performance event, associate it to a task/cpu - * - * @attr_uptr: event_id type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader event fd - */ -SYSCALL_DEFINE5(perf_event_open, - struct perf_event_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_event *group_leader = NULL, *output_event = NULL; - struct perf_event *event, *sibling; - struct perf_event_attr attr; - struct perf_event_context *ctx; - struct file *event_file = NULL; - struct file *group_file = NULL; - struct task_struct *task = NULL; - struct pmu *pmu; - int event_fd; - int move_group = 0; - int fput_needed = 0; - int err; - - /* for future expandability... */ - if (flags & ~PERF_FLAG_ALL) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_event_sample_rate) - return -EINVAL; - } - - /* - * In cgroup mode, the pid argument is used to pass the fd - * opened to the cgroup directory in cgroupfs. The cpu argument - * designates the cpu on which to monitor threads from that - * cgroup. 
- */ - if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) - return -EINVAL; - - event_fd = get_unused_fd_flags(O_RDWR); - if (event_fd < 0) - return event_fd; - - if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); - goto err_fd; - } - group_file = group_leader->filp; - if (flags & PERF_FLAG_FD_OUTPUT) - output_event = group_leader; - if (flags & PERF_FLAG_FD_NO_GROUP) - group_leader = NULL; - } - - if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { - task = find_lively_task_by_vpid(pid); - if (IS_ERR(task)) { - err = PTR_ERR(task); - goto err_group_fd; - } - } - - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, - NULL, NULL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_task; - } - - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) - goto err_alloc; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); - } - - /* - * Special case software events and allow them to be part of - * any hardware group. - */ - pmu = event->pmu; - - if (group_leader && - (is_software_event(event) != is_software_event(group_leader))) { - if (is_software_event(event)) { - /* - * If event and group_leader are not both a software - * event, and event is, then group leader is not. - * - * Allow the addition of software events to !software - * groups, this is safe because software events never - * fail to schedule. - */ - pmu = group_leader->pmu; - } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. 
- */ - move_group = 1; - } - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pmu, task, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_alloc; - } - - if (task) { - put_task_struct(task); - task = NULL; - } - - /* - * Look up the group leader (we will attach this event to it): - */ - if (group_leader) { - err = -EINVAL; - - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (move_group) { - if (group_leader->ctx->type != ctx->type) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } - - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - goto err_context; - } - - if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; - - mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { - perf_remove_from_context(sibling); - put_ctx(gctx); - } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } - - event->filp = event_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - - if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); - get_ctx(ctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { - perf_install_in_context(ctx, sibling, cpu); - get_ctx(ctx); - } - } - - perf_install_in_context(ctx, event, cpu); - ++ctx->generation; - perf_unpin_context(ctx); - mutex_unlock(&ctx->mutex); - - event->owner = 
current; - - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - - /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). - */ - fput_light(group_file, fput_needed); - fd_install(event_fd, event_file); - return event_fd; - -err_context: - perf_unpin_context(ctx); - put_ctx(ctx); -err_alloc: - free_event(event); -err_task: - if (task) - put_task_struct(task); -err_group_fd: - fput_light(group_file, fput_needed); -err_fd: - put_unused_fd(event_fd); - return err; -} - -/** - * perf_event_create_kernel_counter - * - * @attr: attributes of the counter to create - * @cpu: cpu in which the counter is bound - * @task: task to profile (NULL for percpu) - */ -struct perf_event * -perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - struct task_struct *task, - perf_overflow_handler_t overflow_handler, - void *context) -{ - struct perf_event_context *ctx; - struct perf_event *event; - int err; - - /* - * Get the target context (task or percpu): - */ - - event = perf_event_alloc(attr, cpu, task, NULL, NULL, - overflow_handler, context); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err; - } - - ctx = find_get_context(event->pmu, task, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_free; - } - - event->filp = NULL; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, event, cpu); - ++ctx->generation; - perf_unpin_context(ctx); - mutex_unlock(&ctx->mutex); - - return event; - -err_free: - free_event(event); -err: - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); - -static void sync_child_event(struct perf_event *child_event, - 
struct task_struct *child) -{ - struct perf_event *parent_event = child_event->parent; - u64 child_val; - - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); - - child_val = perf_event_count(child_event); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_event->child_count); - atomic64_add(child_event->total_time_enabled, - &parent_event->child_total_time_enabled); - atomic64_add(child_event->total_time_running, - &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - fput(parent_event->filp); -} - -static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) -{ - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); - - /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. 
- */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } -} - -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) -{ - struct perf_event *child_event, *tmp; - struct perf_event_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); - child->perf_event_ctxp[ctxn] = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. - */ - unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the events so that we - * won't get any samples after PERF_RECORD_EXIT. We can however still - * get a few PERF_RECORD_READ events. - */ - perf_event_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. 
- */ - mutex_lock(&child_ctx->mutex); - -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) -{ - struct perf_event *event, *tmp; - int ctxn; - - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, - owner_entry) { - list_del_init(&event->owner_entry); - - /* - * Ensure the list deletion is visible before we clear - * the owner, closes a race against perf_release() where - * we need to serialize on the owner->perf_event_mutex. - */ - smp_wmb(); - event->owner = NULL; - } - mutex_unlock(&child->perf_event_mutex); - - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - perf_group_detach(event); - list_del_event(event, ctx); - free_event(event); -} - -/* - * free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. 
- */ -void perf_event_free_task(struct task_struct *task) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, - group_entry) - perf_free_event(event, ctx); - - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); - - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); - } -} - -void perf_event_delayed_put(struct task_struct *task) -{ - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); -} - -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - unsigned long flags; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, - child, - group_leader, parent_event, - NULL, NULL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. 
- */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - local64_set(&hwc->period_left, sample_period); - } - - child_event->ctx = child_ctx; - child_event->overflow_handler = parent_event->overflow_handler; - child_event->overflow_handler_context - = parent_event->overflow_handler_context; - - /* - * Precalculate sample_data sizes - */ - perf_event__header_size(child_event); - perf_event__id_header_size(child_event); - - /* - * Link it up in the child's context: - */ - raw_spin_lock_irqsave(&child_ctx->lock, flags); - add_event_to_ctx(child_event, child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. 
This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static int -inherit_task_group(struct perf_event *event, struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) -{ - int ret; - struct perf_event_context *child_ctx; - - if (!event->attr.inherit) { - *inherited_all = 0; - return 0; - } - - child_ctx = child->perf_event_ctxp[ctxn]; - if (!child_ctx) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. 
- */ - - child_ctx = alloc_perf_context(event->pmu, child); - if (!child_ctx) - return -ENOMEM; - - child->perf_event_ctxp[ctxn] = child_ctx; - } - - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - - if (ret) - *inherited_all = 0; - - return ret; -} - -/* - * Initialize the perf_event context in task_struct - */ -int perf_event_init_context(struct task_struct *child, int ctxn) -{ - struct perf_event_context *child_ctx, *parent_ctx; - struct perf_event_context *cloned_ctx; - struct perf_event *event; - struct task_struct *parent = current; - int inherited_all = 1; - unsigned long flags; - int ret = 0; - - if (likely(!parent->perf_event_ctxp[ctxn])) - return 0; - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent, ctxn); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - break; - } - - /* - * We can't hold ctx->lock when iterating the ->flexible_group list due - * to allocations, but we need to prevent rotation because - * rotate_ctx() will change the list from interrupt context. 
- */ - raw_spin_lock_irqsave(&parent_ctx->lock, flags); - parent_ctx->rotate_disable = 1; - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - break; - } - - raw_spin_lock_irqsave(&parent_ctx->lock, flags); - parent_ctx->rotate_disable = 0; - - child_ctx = child->perf_event_ctxp[ctxn]; - - if (child_ctx && inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * - * Note that if the parent is a clone, the holding of - * parent_ctx->lock avoids it from being uncloned. - */ - cloned_ctx = parent_ctx->parent_ctx; - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - put_ctx(parent_ctx); - - return ret; -} - -/* - * Initialize the perf_event context in task_struct - */ -int perf_event_init_task(struct task_struct *child) -{ - int ctxn, ret; - - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); - if (ret) - return ret; - } - - return 0; -} - -static void __init perf_event_init_all_cpus(void) -{ - struct swevent_htable *swhash; - int cpu; - - for_each_possible_cpu(cpu) { - swhash = &per_cpu(swevent_htable, cpu); - mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); - } -} - -static void __cpuinit perf_event_init_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - 
- mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { - struct swevent_hlist *hlist; - - hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); - WARN_ON(!hlist); - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - mutex_unlock(&swhash->hlist_mutex); -} - -#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - -static void __perf_event_exit_context(void *__info) -{ - struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; - - perf_pmu_rotate_stop(ctx->pmu); - - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_remove_from_context(event); -} - -static void perf_event_exit_cpu_context(int cpu) -{ - struct perf_event_context *ctx; - struct pmu *pmu; - int idx; - - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - mutex_unlock(&ctx->mutex); - } - srcu_read_unlock(&pmus_srcu, idx); -} - -static void perf_event_exit_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); -} -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif - -static int -perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) -{ - int cpu; - - for_each_online_cpu(cpu) - perf_event_exit_cpu(cpu); - - return NOTIFY_OK; -} - -/* - * Run the perf reboot notifier at the very last possible moment so that - * the generic watchdog 
code runs as long as possible. - */ -static struct notifier_block perf_reboot_notifier = { - .notifier_call = perf_reboot, - .priority = INT_MIN, -}; - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: - perf_event_init_cpu(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DOWN_PREPARE: - perf_event_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -void __init perf_event_init(void) -{ - int ret; - - idr_init(&pmu_idr); - - perf_event_init_all_cpus(); - init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); - perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); - register_reboot_notifier(&perf_reboot_notifier); - - ret = init_hw_breakpoint(); - WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); -} - -static int __init perf_event_sysfs_init(void) -{ - struct pmu *pmu; - int ret; - - mutex_lock(&pmus_lock); - - ret = bus_register(&pmu_bus); - if (ret) - goto unlock; - - list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) - continue; - - ret = pmu_dev_alloc(pmu); - WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); - } - pmu_bus_running = 1; - ret = 0; - -unlock: - mutex_unlock(&pmus_lock); - - return ret; -} -device_initcall(perf_event_sysfs_init); - -#ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) -{ - struct perf_cgroup *jc; - - jc = kzalloc(sizeof(*jc), GFP_KERNEL); - if (!jc) - return ERR_PTR(-ENOMEM); - - jc->info = alloc_percpu(struct perf_cgroup_info); - if (!jc->info) { 
- kfree(jc); - return ERR_PTR(-ENOMEM); - } - - return &jc->css; -} - -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), - struct perf_cgroup, css); - free_percpu(jc->info); - kfree(jc); -} - -static int __perf_cgroup_move(void *info) -{ - struct task_struct *task = info; - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); - return 0; -} - -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) -{ - struct task_struct *task; - - cgroup_taskset_for_each(task, cgrp, tset) - task_function_call(task, __perf_cgroup_move, task); -} - -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - task_function_call(task, __perf_cgroup_move, task); -} - -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, - .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, -}; -#endif /* CONFIG_CGROUP_PERF */ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) 2007 Alan Stern - * Copyright (C) IBM Corporation, 2009 - * Copyright (C) 2009, Frederic Weisbecker - * - * Thanks to Ingo Molnar for his many suggestions. - * - * Authors: Alan Stern - * K.Prasad - * Frederic Weisbecker - */ - -/* - * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, - * using the CPU's debug registers. - * This file contains the arch-independent routines. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * Constraints data - */ - -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); - -static int nr_slots[TYPE_MAX]; - -/* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); - -static int constraints_initialized; - -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; - -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); - -__weak int hw_breakpoint_weight(struct perf_event *bp) -{ - return 1; -} - -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) -{ - if (bp->attr.bp_type & HW_BREAKPOINT_RW) - return TYPE_DATA; - - return TYPE_INST; -} - -/* - * Report the maximum number of pinned breakpoints a task - * have in this cpu - */ -static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) -{ - 
int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; -} - -/* - * Count the number of breakpoints of the same type and same task. - * The given event must be not on the list. - */ -static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) -{ - struct task_struct *tsk = bp->hw.bp_target; - struct perf_event *iter; - int count = 0; - - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) - count += hw_breakpoint_weight(iter); - } - - return count; -} - -/* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). - */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) -{ - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - slots->pinned += max_task_bp_pinned(cpu, type); - else - slots->pinned += task_bp_pinned(bp, type); - slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - - return; - } - - for_each_online_cpu(cpu) { - unsigned int nr; - - nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - nr += max_task_bp_pinned(cpu, type); - else - nr += task_bp_pinned(bp, type); - - if (nr > slots->pinned) - slots->pinned = nr; - - nr = per_cpu(nr_bp_flexible[type], cpu); - - if (nr > slots->flexible) - slots->flexible = nr; - } -} - -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. 
- */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned; - int old_count = 0; - int old_idx = 0; - int idx = 0; - - old_count = task_bp_pinned(bp, type); - old_idx = old_count - 1; - idx = old_idx + weight; - - /* tsk_pinned[n] is the number of tasks having n breakpoints */ - tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - if (enable) { - tsk_pinned[idx]++; - if (old_count > 0) - tsk_pinned[old_idx]--; - } else { - tsk_pinned[idx]--; - if (old_count > 0) - tsk_pinned[old_idx]++; - } -} - -/* - * Add/remove the given breakpoint in our constraint table - */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) -{ - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - /* Pinned counter cpu profiling */ - if (!tsk) { - - if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; - return; - } - - /* Pinned counter task profiling */ - - if (!enable) - list_del(&bp->hw.bp_list); - - if (cpu >= 0) { - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } else { - for_each_online_cpu(cpu) - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } - - if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); -} - -/* - * Function to perform processor-specific cleanup during unregistration - */ -__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) -{ - /* - * A weak stub function here for those archs that don't define - * it inside arch/.../kernel/hw_breakpoint.c - */ -} - -/* - * Contraints to check before allowing this new breakpoint counter: - * - * == Non-pinned counter == (Considered as pinned for now) - * - * - If attached to a single cpu, 
check: - * - * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM - * - * -> If there are already non-pinned counters in this cpu, it means - * there is already a free slot for them. - * Otherwise, we check that the maximum number of per task - * breakpoints (for this cpu) plus the number of per cpu breakpoint - * (for this cpu) doesn't cover every registers. - * - * - If attached to every cpus, check: - * - * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM - * - * -> This is roughly the same, except we check the number of per cpu - * bp for every cpu and we keep the max one. Same for the per tasks - * breakpoints. - * - * - * == Pinned counter == - * - * - If attached to a single cpu, check: - * - * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM - * - * -> Same checks as before. But now the nr_bp_flexible, if any, must keep - * one register at least (or they will never be fed). - * - * - If attached to every cpus, check: - * - * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM - */ -static int __reserve_bp_slot(struct perf_event *bp) -{ - struct bp_busy_slots slots = {0}; - enum bp_type_idx type; - int weight; - - /* We couldn't initialize breakpoint constraints on boot */ - if (!constraints_initialized) - return -ENOMEM; - - /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) - return -EINVAL; - - type = find_slot_idx(bp); - weight = hw_breakpoint_weight(bp); - - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. 
- */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) - return -ENOSPC; - - toggle_bp_slot(bp, true, type, weight); - - return 0; -} - -int reserve_bp_slot(struct perf_event *bp) -{ - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp); - - mutex_unlock(&nr_bp_mutex); - - return ret; -} - -static void __release_bp_slot(struct perf_event *bp) -{ - enum bp_type_idx type; - int weight; - - type = find_slot_idx(bp); - weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); -} - -void release_bp_slot(struct perf_event *bp) -{ - mutex_lock(&nr_bp_mutex); - - arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); - - mutex_unlock(&nr_bp_mutex); -} - -/* - * Allow the kernel debugger to reserve breakpoint slots without - * taking a lock using the dbg_* variant of for the reserve and - * release breakpoint slots. - */ -int dbg_reserve_bp_slot(struct perf_event *bp) -{ - if (mutex_is_locked(&nr_bp_mutex)) - return -1; - - return __reserve_bp_slot(bp); -} - -int dbg_release_bp_slot(struct perf_event *bp) -{ - if (mutex_is_locked(&nr_bp_mutex)) - return -1; - - __release_bp_slot(bp); - - return 0; -} - -static int validate_hw_breakpoint(struct perf_event *bp) -{ - int ret; - - ret = arch_validate_hwbkpt_settings(bp); - if (ret) - return ret; - - if (arch_check_bp_in_kernelspace(bp)) { - if (bp->attr.exclude_kernel) - return -EINVAL; - /* - * Don't let unprivileged users set a breakpoint in the trap - * path to avoid trap recursion attacks. 
- */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - } - - return 0; -} - -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - int ret; - - ret = reserve_bp_slot(bp); - if (ret) - return ret; - - ret = validate_hw_breakpoint(bp); - - /* if arch_validate_hwbkpt_settings() fails then release bp slot */ - if (ret) - release_bp_slot(bp); - - return ret; -} - -/** - * register_user_hw_breakpoint - register a hardware breakpoint for user space - * @attr: breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs - */ -struct perf_event * -register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered, - void *context, - struct task_struct *tsk) -{ - return perf_event_create_kernel_counter(attr, -1, tsk, triggered, - context); -} -EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); - -/** - * modify_user_hw_breakpoint - modify a user-space hardware breakpoint - * @bp: the breakpoint structure to modify - * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs - */ -int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) -{ - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - - perf_event_disable(bp); - - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - if (attr->disabled) - goto end; - - err = validate_hw_breakpoint(bp); - if (!err) - perf_event_enable(bp); - - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); - - return err; - } - -end: - bp->attr.disabled = attr->disabled; - - return 0; -} -EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); - -/** - * 
unregister_hw_breakpoint - unregister a user-space hardware breakpoint - * @bp: the breakpoint structure to unregister - */ -void unregister_hw_breakpoint(struct perf_event *bp) -{ - if (!bp) - return; - perf_event_release_kernel(bp); -} -EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); - -/** - * register_wide_hw_breakpoint - register a wide breakpoint in the kernel - * @attr: breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * - * @return a set of per_cpu pointers to perf events - */ -struct perf_event * __percpu * -register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered, - void *context) -{ - struct perf_event * __percpu *cpu_events, **pevent, *bp; - long err; - int cpu; - - cpu_events = alloc_percpu(typeof(*cpu_events)); - if (!cpu_events) - return (void __percpu __force *)ERR_PTR(-ENOMEM); - - get_online_cpus(); - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, NULL, - triggered, context); - - *pevent = bp; - - if (IS_ERR(bp)) { - err = PTR_ERR(bp); - goto fail; - } - } - put_online_cpus(); - - return cpu_events; - -fail: - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent)) - break; - unregister_hw_breakpoint(*pevent); - } - put_online_cpus(); - - free_percpu(cpu_events); - return (void __percpu __force *)ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); - -/** - * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel - * @cpu_events: the per cpu set of events to unregister - */ -void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) -{ - int cpu; - struct perf_event **pevent; - - for_each_possible_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - unregister_hw_breakpoint(*pevent); - } - free_percpu(cpu_events); -} -EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); - -static struct notifier_block 
hw_breakpoint_exceptions_nb = { - .notifier_call = hw_breakpoint_exceptions_notify, - /* we need to be notified first */ - .priority = 0x7fffffff -}; - -static void bp_perf_event_destroy(struct perf_event *event) -{ - release_bp_slot(event); -} - -static int hw_breakpoint_event_init(struct perf_event *bp) -{ - int err; - - if (bp->attr.type != PERF_TYPE_BREAKPOINT) - return -ENOENT; - - err = register_perf_hw_breakpoint(bp); - if (err) - return err; - - bp->destroy = bp_perf_event_destroy; - - return 0; -} - -static int hw_breakpoint_add(struct perf_event *bp, int flags) -{ - if (!(flags & PERF_EF_START)) - bp->hw.state = PERF_HES_STOPPED; - - return arch_install_hw_breakpoint(bp); -} - -static void hw_breakpoint_del(struct perf_event *bp, int flags) -{ - arch_uninstall_hw_breakpoint(bp); -} - -static void hw_breakpoint_start(struct perf_event *bp, int flags) -{ - bp->hw.state = 0; -} - -static void hw_breakpoint_stop(struct perf_event *bp, int flags) -{ - bp->hw.state = PERF_HES_STOPPED; -} - -static struct pmu perf_breakpoint = { - .task_ctx_nr = perf_sw_context, /* could eventually get its own */ - - .event_init = hw_breakpoint_event_init, - .add = hw_breakpoint_add, - .del = hw_breakpoint_del, - .start = hw_breakpoint_start, - .stop = hw_breakpoint_stop, - .read = hw_breakpoint_pmu_read, -}; - -int __init init_hw_breakpoint(void) -{ - unsigned int **task_bp_pinned; - int cpu, err_cpu; - int i; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); - - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); - *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], - GFP_KERNEL); - if (!*task_bp_pinned) - goto err_alloc; - } - } - - constraints_initialized = 1; - - perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); - - return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - 
kfree(per_cpu(nr_task_bp_pinned[i], cpu)); - if (err_cpu == cpu) - break; - } - - return -ENOMEM; -} - - -/* - * Performance events ring-buffer code: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include - -#include "internal.h" - -static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!rb->writable) - return true; - - mask = perf_data_size(rb) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->rb->poll, POLL_IN); - - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); -} - -/* - * We need to ensure a later event_id doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_get_handle(struct perf_output_handle *handle) -{ - struct ring_buffer *rb = handle->rb; - - preempt_disable(); - local_inc(&rb->nest); - handle->wakeup = local_read(&rb->wakeup); -} - -static void perf_output_put_handle(struct perf_output_handle *handle) -{ - struct ring_buffer *rb = handle->rb; - unsigned long head; - -again: - head = local_read(&rb->head); - - /* - * IRQ/NMI can happen here, which means we can miss a head update. - */ - - if (!local_dec_and_test(&rb->nest)) - goto out; - - /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the rb->head read and this - * write. 
- */ - rb->user_page->data_head = head; - - /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read rb->head. - */ - if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); - goto again; - } - - if (handle->wakeup != local_read(&rb->wakeup)) - perf_output_wakeup(handle); - -out: - preempt_enable(); -} - -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) -{ - struct ring_buffer *rb; - unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited events we send all the output towards the parent. - */ - if (event->parent) - event = event->parent; - - rb = rcu_dereference(event->rb); - if (!rb) - goto out; - - handle->rb = rb; - handle->event = event; - - if (!rb->nr_pages) - goto out; - - have_lost = local_read(&rb->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; - } - - perf_output_get_handle(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. 
- */ - tail = ACCESS_ONCE(rb->user_page->data_tail); - smp_rmb(); - offset = head = local_read(&rb->head); - head += size; - if (unlikely(!perf_output_space(rb, tail, offset, head))) - goto fail; - } while (local_cmpxchg(&rb->head, offset, head) != offset); - - if (head - local_read(&rb->wakeup) > rb->watermark) - local_add(rb->watermark, &rb->wakeup); - - handle->page = offset >> (PAGE_SHIFT + page_order(rb)); - handle->page &= rb->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); - handle->addr = rb->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; - - if (have_lost) { - lost_event.header.type = PERF_RECORD_LOST; - lost_event.header.misc = 0; - lost_event.id = event->id; - lost_event.lost = local_xchg(&rb->lost, 0); - - perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); - } - - return 0; - -fail: - local_inc(&rb->lost); - perf_output_put_handle(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - __output_copy(handle, buf, len); -} - -void perf_output_end(struct perf_output_handle *handle) -{ - perf_output_put_handle(handle); - rcu_read_unlock(); -} - -static void -ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) -{ - long max_size = perf_data_size(rb); - - if (watermark) - rb->watermark = min(max_size, watermark); - - if (!rb->watermark) - rb->watermark = max_size / 2; - - if (flags & RING_BUFFER_WRITABLE) - rb->writable = 1; - - atomic_set(&rb->refcount, 1); - - INIT_LIST_HEAD(&rb->event_list); - spin_lock_init(&rb->event_lock); -} - -#ifndef CONFIG_PERF_USE_VMALLOC - -/* - * Back perf_mmap() with regular GFP_KERNEL-0 pages. 
- */ - -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) -{ - if (pgoff > rb->nr_pages) - return NULL; - - if (pgoff == 0) - return virt_to_page(rb->user_page); - - return virt_to_page(rb->data_pages[pgoff - 1]); -} - -static void *perf_mmap_alloc_page(int cpu) -{ - struct page *page; - int node; - - node = (cpu == -1) ? cpu : cpu_to_node(cpu); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) - return NULL; - - return page_address(page); -} - -struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct ring_buffer *rb; - unsigned long size; - int i; - - size = sizeof(struct ring_buffer); - size += nr_pages * sizeof(void *); - - rb = kzalloc(size, GFP_KERNEL); - if (!rb) - goto fail; - - rb->user_page = perf_mmap_alloc_page(cpu); - if (!rb->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - rb->data_pages[i] = perf_mmap_alloc_page(cpu); - if (!rb->data_pages[i]) - goto fail_data_pages; - } - - rb->nr_pages = nr_pages; - - ring_buffer_init(rb, watermark, flags); - - return rb; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)rb->data_pages[i]); - - free_page((unsigned long)rb->user_page); - -fail_user_page: - kfree(rb); - -fail: - return NULL; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -void rb_free(struct ring_buffer *rb) -{ - int i; - - perf_mmap_free_page((unsigned long)rb->user_page); - for (i = 0; i < rb->nr_pages; i++) - perf_mmap_free_page((unsigned long)rb->data_pages[i]); - kfree(rb); -} - -#else - -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) -{ - if (pgoff > (1UL << page_order(rb))) - return NULL; - - return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); -} - -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - 
page->mapping = NULL; -} - -static void rb_free_work(struct work_struct *work) -{ - struct ring_buffer *rb; - void *base; - int i, nr; - - rb = container_of(work, struct ring_buffer, work); - nr = 1 << page_order(rb); - - base = rb->user_page; - for (i = 0; i < nr + 1; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - - vfree(base); - kfree(rb); -} - -void rb_free(struct ring_buffer *rb) -{ - schedule_work(&rb->work); -} - -struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct ring_buffer *rb; - unsigned long size; - void *all_buf; - - size = sizeof(struct ring_buffer); - size += sizeof(void *); - - rb = kzalloc(size, GFP_KERNEL); - if (!rb) - goto fail; - - INIT_WORK(&rb->work, rb_free_work); - - all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); - if (!all_buf) - goto fail_all_buf; - - rb->user_page = all_buf; - rb->data_pages[0] = all_buf + PAGE_SIZE; - rb->page_order = ilog2(nr_pages); - rb->nr_pages = 1; - - ring_buffer_init(rb, watermark, flags); - - return rb; - -fail_all_buf: - kfree(rb); - -fail: - return NULL; -} - -#endif -/* - * Handling of different ABIs (personalities). - * - * We group personalities into execution domains which have their - * own handlers for kernel entry points, signal mapping, etc... - * - * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static void default_handler(int, struct pt_regs *); - -static struct exec_domain *exec_domains = &default_exec_domain; -static DEFINE_RWLOCK(exec_domains_lock); - - -static unsigned long ident_map[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 -}; - -struct exec_domain default_exec_domain = { - .name = "Linux", /* name */ - .handler = default_handler, /* lcall7 causes a seg fault. 
*/ - .pers_low = 0, /* PER_LINUX personality. */ - .pers_high = 0, /* PER_LINUX personality. */ - .signal_map = ident_map, /* Identity map signals. */ - .signal_invmap = ident_map, /* - both ways. */ -}; - - -static void -default_handler(int segment, struct pt_regs *regp) -{ - set_personality(0); - - if (current_thread_info()->exec_domain->handler != default_handler) - current_thread_info()->exec_domain->handler(segment, regp); - else - send_sig(SIGSEGV, current, 1); -} - -static struct exec_domain * -lookup_exec_domain(unsigned int personality) -{ - unsigned int pers = personality(personality); - struct exec_domain *ep; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } - -#ifdef CONFIG_MODULES - read_unlock(&exec_domains_lock); - request_module("personality-%d", pers); - read_lock(&exec_domains_lock); - - for (ep = exec_domains; ep; ep = ep->next) { - if (pers >= ep->pers_low && pers <= ep->pers_high) - if (try_module_get(ep->module)) - goto out; - } -#endif - - ep = &default_exec_domain; -out: - read_unlock(&exec_domains_lock); - return (ep); -} - -int -register_exec_domain(struct exec_domain *ep) -{ - struct exec_domain *tmp; - int err = -EBUSY; - - if (ep == NULL) - return -EINVAL; - - if (ep->next != NULL) - return -EBUSY; - - write_lock(&exec_domains_lock); - for (tmp = exec_domains; tmp; tmp = tmp->next) { - if (tmp == ep) - goto out; - } - - ep->next = exec_domains; - exec_domains = ep; - err = 0; - -out: - write_unlock(&exec_domains_lock); - return (err); -} - -int -unregister_exec_domain(struct exec_domain *ep) -{ - struct exec_domain **epp; - - epp = &exec_domains; - write_lock(&exec_domains_lock); - for (epp = &exec_domains; *epp; epp = &(*epp)->next) { - if (ep == *epp) - goto unregister; - } - write_unlock(&exec_domains_lock); - return -EINVAL; - -unregister: - *epp = ep->next; - ep->next = NULL; - 
write_unlock(&exec_domains_lock); - return 0; -} - -int __set_personality(unsigned int personality) -{ - struct exec_domain *oep = current_thread_info()->exec_domain; - - current_thread_info()->exec_domain = lookup_exec_domain(personality); - current->personality = personality; - module_put(oep->module); - - return 0; -} - -#ifdef CONFIG_PROC_FS -static int execdomains_proc_show(struct seq_file *m, void *v) -{ - struct exec_domain *ep; - - read_lock(&exec_domains_lock); - for (ep = exec_domains; ep; ep = ep->next) - seq_printf(m, "%d-%d\t%-16s\t[%s]\n", - ep->pers_low, ep->pers_high, ep->name, - module_name(ep->module)); - read_unlock(&exec_domains_lock); - return 0; -} - -static int execdomains_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, execdomains_proc_show, NULL); -} - -static const struct file_operations execdomains_proc_fops = { - .open = execdomains_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_execdomains_init(void) -{ - proc_create("execdomains", 0, NULL, &execdomains_proc_fops); - return 0; -} -module_init(proc_execdomains_init); -#endif - -SYSCALL_DEFINE1(personality, unsigned int, personality) -{ - unsigned int old = current->personality; - - if (personality != 0xffffffff) - set_personality(personality); - - return old; -} - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); -/* - * linux/kernel/exit.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for audit_free() */ -#include -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static void exit_mm(struct task_struct * tsk); - -static void __unhash_process(struct task_struct *p, bool group_dead) -{ - nr_threads--; - detach_pid(p, PIDTYPE_PID); - if (group_dead) { - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); - - list_del_rcu(&p->tasks); - list_del_init(&p->sibling); - __this_cpu_dec(process_counts); - } - list_del_rcu(&p->thread_group); -} - -/* - * This function expects the tasklist_lock write-locked. - */ -static void __exit_signal(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - bool group_dead = thread_group_leader(tsk); - struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); - - sighand = rcu_dereference_check(tsk->sighand, - lockdep_tasklist_lock_is_held()); - spin_lock(&sighand->siglock); - - posix_cpu_timers_exit(tsk); - if (group_dead) { - posix_cpu_timers_exit_group(tsk); - tty = sig->tty; - sig->tty = NULL; - } else { - /* - * This can only happen if the caller is de_thread(). - * FIXME: this is the temporary hack, we should teach - * posix-cpu-timers to handle this case correctly. - */ - if (unlikely(has_group_leader_pid(tsk))) - posix_cpu_timers_exit_group(tsk); - - /* - * If there is any task waiting for the group exit - * then notify it: - */ - if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); - - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. 
- */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; - sig->gtime += tsk->gtime; - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - } - - sig->nr_threads--; - __unhash_process(tsk, group_dead); - - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); - tsk->sighand = NULL; - spin_unlock(&sighand->siglock); - - __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); - tty_kref_put(tty); - } -} - -static void delayed_put_task_struct(struct rcu_head *rhp) -{ - struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); - - perf_event_delayed_put(tsk); - trace_sched_process_free(tsk); - put_task_struct(tsk); -} - - -void release_task(struct task_struct * p) -{ - struct task_struct *leader; - int zap_leader; -repeat: - /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials. But shut RCU-lockdep up */ - rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); - rcu_read_unlock(); - - proc_flush_task(p); - - write_lock_irq(&tasklist_lock); - ptrace_release_task(p); - __exit_signal(p); - - /* - * If we are the last non-leader member of the thread - * group, and the leader is zombie, then notify the - * group leader's parent process. (if it wants notification.) 
- */ - zap_leader = 0; - leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - /* - * If we were the last child thread and the leader has - * exited already, and the leader's parent ignores SIGCHLD, - * then we are the one who should release the leader. - */ - zap_leader = do_notify_parent(leader, leader->exit_signal); - if (zap_leader) - leader->exit_state = EXIT_DEAD; - } - - write_unlock_irq(&tasklist_lock); - release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); - - p = leader; - if (unlikely(zap_leader)) - goto repeat; -} - -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* - * Determine if a process group is "orphaned", according to the POSIX - * definition in 2.2.2.52. Orphaned process groups are not to be affected - * by terminal-generated stop signals. Newly orphaned process groups are - * to receive a SIGHUP and a SIGCONT. - * - * "I ask you, have you ever known what it is to be an orphan?" 
- */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) -{ - struct task_struct *p; - - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if ((p == ignored_task) || - (p->exit_state && thread_group_empty(p)) || - is_global_init(p->real_parent)) - continue; - - if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) - return 0; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - - return 1; -} - -int is_current_pgrp_orphaned(void) -{ - int retval; - - read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); - read_unlock(&tasklist_lock); - - return retval; -} - -static bool has_stopped_jobs(struct pid *pgrp) -{ - struct task_struct *p; - - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (p->signal->flags & SIGNAL_STOP_STOPPED) - return true; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - - return false; -} - -/* - * Check to see if any process groups have become orphaned as - * a result of our exiting, and if they have any stopped jobs, - * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ -static void -kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) -{ - struct pid *pgrp = task_pgrp(tsk); - struct task_struct *ignored_task = tsk; - - if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ - parent = tsk->real_parent; - else - /* reparent: our child is in a different pgrp than - * we are, and it was the only connection outside. 
- */ - ignored_task = NULL; - - if (task_pgrp(parent) != pgrp && - task_session(parent) == task_session(tsk) && - will_become_orphaned_pgrp(pgrp, ignored_task) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } -} - -/** - * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to kthreadd so it - * isn't in the way of other processes and is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited from a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_kthreadd() gives the caller full capabilities. - */ -static void reparent_to_kthreadd(void) -{ - write_lock_irq(&tasklist_lock); - - ptrace_unlink(current); - /* Reparent to init */ - current->real_parent = current->parent = kthreadd_task; - list_move_tail(¤t->sibling, ¤t->real_parent->children); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - if (task_nice(current) < 0) - set_user_nice(current, 0); - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - memcpy(current->signal->rlim, init_task.signal->rlim, - sizeof(current->signal->rlim)); - - atomic_inc(&init_cred.usage); - commit_creds(&init_cred); - write_unlock_irq(&tasklist_lock); -} - -void __set_special_pids(struct pid *pid) -{ - struct task_struct *curr = current->group_leader; - - if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); - - if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); -} - -static void set_special_pids(struct pid *pid) -{ - write_lock_irq(&tasklist_lock); - __set_special_pids(pid); - write_unlock_irq(&tasklist_lock); -} - -/* - * Let kernel threads use this to say that they allow a certain signal. 
- * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(¤t->blocked, sig); - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. - */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(const char *name, ...) -{ - va_list args; - sigset_t blocked; - - va_start(args, name); - vsnprintf(current->comm, sizeof(current->comm), name, args); - va_end(args); - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(current); - /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation - * or suspend transition begins right now. 
- */ - current->flags |= (PF_NOFREEZE | PF_KTHREAD); - - if (current->nsproxy != &init_nsproxy) { - get_nsproxy(&init_nsproxy); - switch_task_namespaces(current, &init_nsproxy); - } - set_special_pids(&init_struct_pid); - proc_clear_tty(current); - - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* Become as one with the init task */ - - daemonize_fs_struct(); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); - - reparent_to_kthreadd(); -} - -EXPORT_SYMBOL(daemonize); - -static void close_files(struct files_struct * files) -{ - int i, j; - struct fdtable *fdt; - - j = 0; - - /* - * It is safe to dereference the fd table without RCU or - * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. - */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); - for (;;) { - unsigned long set; - i = j * __NFDBITS; - if (i >= fdt->max_fds) - break; - set = fdt->open_fds->fds_bits[j++]; - while (set) { - if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); - if (file) { - filp_close(file, files); - cond_resched(); - } - } - i++; - set >>= 1; - } - } -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - -void put_files_struct(struct files_struct *files) -{ - struct fdtable *fdt; - - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* - * Free the fd and fdset arrays if we expanded them. - * If the fdtable was embedded, pass files for freeing - * at the end of the RCU grace period. Otherwise, - * you can free files immediately. 
- */ - rcu_read_lock(); - fdt = files_fdtable(files); - if (fdt != &files->fdtab) - kmem_cache_free(files_cachep, files); - free_fdtable(fdt); - rcu_read_unlock(); - } -} - -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ - struct files_struct * files = tsk->files; - - if (files) { - task_lock(tsk); - tsk->files = NULL; - task_unlock(tsk); - put_files_struct(files); - } -} - -#ifdef CONFIG_MM_OWNER -/* - * A task is exiting. If it owned this mm, find a new owner for the mm. - */ -void mm_update_next_owner(struct mm_struct *mm) -{ - struct task_struct *c, *g, *p = current; - -retry: - /* - * If the exiting or execing task is not the owner, it's - * someone else's problem. - */ - if (mm->owner != p) - return; - /* - * The current owner is exiting/execing and there are no other - * candidates. Do not leave the mm pointing to a possibly - * freed task structure. - */ - if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; - return; - } - - read_lock(&tasklist_lock); - /* - * Search in the children - */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; - } - - /* - * Search in the siblings - */ - list_for_each_entry(c, &p->real_parent->children, sibling) { - if (c->mm == mm) - goto assign_new_owner; - } - - /* - * Search through everything else. We should not get - * here often - */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - - read_unlock(&tasklist_lock); - /* - * We found no owner yet mm_users > 1: this implies that we are - * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL. 
- */ - mm->owner = NULL; - return; - -assign_new_owner: - BUG_ON(c == p); - get_task_struct(c); - /* - * The task_lock protects c->mm from changing. - * We always want mm->owner->mm == mm - */ - task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); - if (c->mm != mm) { - task_unlock(c); - put_task_struct(c); - goto retry; - } - mm->owner = c; - task_unlock(c); - put_task_struct(c); -} -#endif /* CONFIG_MM_OWNER */ - -/* - * Turn us into a lazy TLB process if we - * aren't already.. - */ -static void exit_mm(struct task_struct * tsk) -{ - struct mm_struct *mm = tsk->mm; - struct core_state *core_state; - - mm_release(tsk, mm); - if (!mm) - return; - /* - * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state - * and clearing tsk->mm. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group with ->mm != NULL. - */ - down_read(&mm->mmap_sem); - core_state = mm->core_state; - if (core_state) { - struct core_thread self; - up_read(&mm->mmap_sem); - - self.task = tsk; - self.next = xchg(&core_state->dumper.next, &self); - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); - - for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_task_state(tsk, TASK_RUNNING); - down_read(&mm->mmap_sem); - } - atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); - /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; - up_read(&mm->mmap_sem); - enter_lazy_tlb(mm, current); - task_unlock(tsk); - mm_update_next_owner(mm); - mmput(mm); -} - -/* - * When we die, we re-parent all our children. 
- * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. - */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; - - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; - return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; - } - - return pid_ns->child_reaper; -} - -/* -* Any that need to be release_task'd are put on the @dead list. - */ -static void reparent_leader(struct task_struct *father, struct task_struct *p, - struct list_head *dead) -{ - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) - return; - - /* We don't want people slaying init. */ - p->exit_signal = SIGCHLD; - - /* If it has exited notify the new parent about this child's death. 
*/ - if (!p->ptrace && - p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { - if (do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); - } - } - - kill_orphaned_pgrp(p, father); -} - -static void forget_original_parent(struct task_struct *father) -{ - struct task_struct *p, *n, *reaper; - LIST_HEAD(dead_children); - - write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); - - list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; - do { - t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); - t->parent = t->real_parent; - } - if (t->pdeath_signal) - group_send_sig_info(t->pdeath_signal, - SEND_SIG_NOINFO, t); - } while_each_thread(p, t); - reparent_leader(father, p, &dead_children); - } - write_unlock_irq(&tasklist_lock); - - BUG_ON(!list_empty(&father->children)); - - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); - release_task(p); - } -} - -/* - * Send signals to all our closest relatives so that they know - * to properly mourn us.. - */ -static void exit_notify(struct task_struct *tsk, int group_dead) -{ - bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); - exit_task_namespaces(tsk); - - write_lock_irq(&tasklist_lock); - if (group_dead) - kill_orphaned_pgrp(tsk->group_leader, NULL); - - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitrary processes. - * That stops right now. 
- * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. - */ - if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) - tsk->exit_signal = SIGCHLD; - - if (unlikely(tsk->ptrace)) { - int sig = thread_group_leader(tsk) && - thread_group_empty(tsk) && - !ptrace_reparented(tsk) ? - tsk->exit_signal : SIGCHLD; - autoreap = do_notify_parent(tsk, sig); - } else if (thread_group_leader(tsk)) { - autoreap = thread_group_empty(tsk) && - do_notify_parent(tsk, tsk->exit_signal); - } else { - autoreap = true; - } - - tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; - - /* mt-exec, de_thread() is waiting for group leader */ - if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); - write_unlock_irq(&tasklist_lock); - - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); -} - -#ifdef CONFIG_DEBUG_STACK_USAGE -static void check_stack_usage(void) -{ - static DEFINE_SPINLOCK(low_water_lock); - static int lowest_to_date = THREAD_SIZE; - unsigned long free; - - free = stack_not_used(current); - - if (free >= lowest_to_date) - return; - - spin_lock(&low_water_lock); - if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); - lowest_to_date = free; - } - spin_unlock(&low_water_lock); -} -#else -static inline void check_stack_usage(void) {} -#endif - -void do_exit(long code) -{ - struct task_struct *tsk = current; - int group_dead; - - profile_task_exit(tsk); - - WARN_ON(blk_needs_flush_plug(tsk)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt 
handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - set_fs(USER_DS); - - ptrace_event(PTRACE_EVENT_EXIT, code); - - validate_creds_for_do_exit(tsk); - - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - - exit_irq_thread(); - - exit_signals(tsk); /* sets PF_EXITING */ - /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. 
- */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - - acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); - group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) { - hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); - if (tsk->mm) - setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); - } - acct_collect(code, group_dead); - if (group_dead) - tty_audit_exit(); - audit_free(tsk); - - tsk->exit_code = code; - taskstats_exit(tsk, group_dead); - - exit_mm(tsk); - - if (group_dead) - acct_process(); - trace_sched_process_exit(tsk); - - exit_sem(tsk); - exit_shm(tsk); - exit_files(tsk); - exit_fs(tsk); - check_stack_usage(); - exit_thread(); - - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - - cgroup_exit(tsk, 1); - - if (group_dead) - disassociate_ctty(1); - - module_put(task_thread_info(tsk)->exec_domain->module); - - proc_exit_connector(tsk); - - /* - * FIXME: do that only when needed, using sched_exit tracepoint - */ - ptrace_put_breakpoints(tsk); - - exit_notify(tsk, group_dead); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif -#ifdef CONFIG_FUTEX - if (unlikely(current->pi_state_cache)) - kfree(current->pi_state_cache); -#endif - /* - * Make sure we are holding no locks: - */ - debug_check_no_locks_held(tsk); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. 
- */ - tsk->flags |= PF_EXITPIDONE; - - if (tsk->io_context) - exit_io_context(tsk); - - if (tsk->splice_pipe) - __free_pipe_info(tsk->splice_pipe); - - validate_creds_for_do_exit(tsk); - - preempt_disable(); - if (tsk->nr_dirtied) - __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); - exit_rcu(); - - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ -} - -EXPORT_SYMBOL_GPL(do_exit); - -void complete_and_exit(struct completion *comp, long code) -{ - if (comp) - complete(comp); - - do_exit(code); -} - -EXPORT_SYMBOL(complete_and_exit); - -SYSCALL_DEFINE1(exit, int, error_code) -{ - do_exit((error_code&0xff)<<8); -} - -/* - * Take down every thread in the group. This is called by fatal signals - * as well as by sys_exit_group (below). - */ -void -do_group_exit(int exit_code) -{ - struct signal_struct *sig = current->signal; - - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) - exit_code = sig->group_exit_code; - else if (!thread_group_empty(current)) { - struct sighand_struct *const sighand = current->sighand; - spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) - /* Another thread got here before we took the lock. 
*/ - exit_code = sig->group_exit_code; - else { - sig->group_exit_code = exit_code; - sig->flags = SIGNAL_GROUP_EXIT; - zap_other_threads(current); - } - spin_unlock_irq(&sighand->siglock); - } - - do_exit(exit_code); - /* NOTREACHED */ -} - -/* - * this kills every thread in the thread group. Note that any externally - * wait4()-ing process will get the correct exit code - even if this - * thread is not the thread group leader. - */ -SYSCALL_DEFINE1(exit_group, int, error_code) -{ - do_group_exit((error_code & 0xff) << 8); - /* NOTREACHED */ - return 0; -} - -struct wait_opts { - enum pid_type wo_type; - int wo_flags; - struct pid *wo_pid; - - struct siginfo __user *wo_info; - int __user *wo_stat; - struct rusage __user *wo_rusage; - - wait_queue_t child_wait; - int notask_error; -}; - -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - -static int eligible_pid(struct wait_opts *wo, struct task_struct *p) -{ - return wo->wo_type == PIDTYPE_MAX || - task_pid_type(p, wo->wo_type) == wo->wo_pid; -} - -static int eligible_child(struct wait_opts *wo, struct task_struct *p) -{ - if (!eligible_pid(wo, p)) - return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) - return 0; - - return 1; -} - -static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, - pid_t pid, uid_t uid, int why, int status) -{ - struct siginfo __user *infop; - int retval = wo->wo_rusage - ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; - - put_task_struct(p); - infop = wo->wo_info; - if (infop) { - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); - } - if (!retval) - retval = pid; - return retval; -} - -/* - * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. - */ -static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) -{ - unsigned long state; - int retval, status, traced; - pid_t pid = task_pid_vnr(p); - uid_t uid = __task_cred(p)->uid; - struct siginfo __user *infop; - - if (!likely(wo->wo_flags & WEXITED)) - return 0; - - if (unlikely(wo->wo_flags & WNOWAIT)) { - int exit_code = p->exit_code; - int why; - - get_task_struct(p); - read_unlock(&tasklist_lock); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; - } - return wait_noreap_copyout(wo, p, pid, uid, why, status); - } - - /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: - */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); - return 0; - } - - traced = ptrace_reparented(p); - /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. 
- */ - if (likely(!traced) && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; - unsigned long maxrss; - cputime_t tgutime, tgstime; - - /* - * The resource counters for the group leader are in its - * own task_struct. Those for dead threads in the group - * are in its signal_struct, as are those for the child - * processes it has previously reaped. All these - * accumulate in the parent's signal_struct c* fields. - * - * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. - * - * We use thread_group_times() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. - */ - thread_group_times(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; - psig->cutime += tgutime + sig->cutime; - psig->cstime += tgstime + sig->cstime; - psig->cgtime += p->gtime + sig->gtime + sig->cgtime; - psig->cmin_flt += - p->min_flt + sig->min_flt + sig->cmin_flt; - psig->cmaj_flt += - p->maj_flt + sig->maj_flt + sig->cmaj_flt; - psig->cnvcsw += - p->nvcsw + sig->nvcsw + sig->cnvcsw; - psig->cnivcsw += - p->nivcsw + sig->nivcsw + sig->cnivcsw; - psig->cinblock += - task_io_get_inblock(p) + - sig->inblock + sig->cinblock; - psig->coublock += - task_io_get_oublock(p) + - sig->oublock + sig->coublock; - maxrss = max(sig->maxrss, sig->cmaxrss); - if (psig->cmaxrss < maxrss) - psig->cmaxrss = maxrss; - task_io_accounting_add(&psig->ioac, &p->ioac); - task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->real_parent->sighand->siglock); - } - - /* - * Now we are sure this task is 
interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. - */ - read_unlock(&tasklist_lock); - - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; - status = (p->signal->flags & SIGNAL_GROUP_EXIT) - ? p->signal->group_exit_code : p->exit_code; - if (!retval && wo->wo_stat) - retval = put_user(status, wo->wo_stat); - - infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - - if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; - } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; - } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); - } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - - if (traced) { - write_lock_irq(&tasklist_lock); - /* We dropped tasklist, ptracer could die and untrace */ - ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } - write_unlock_irq(&tasklist_lock); - } - if (p != NULL) - release_task(p); - - return retval; -} - -static int *task_stopped_code(struct task_struct *p, bool ptrace) -{ - if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) - return &p->exit_code; - } else { - if (p->signal->flags & SIGNAL_STOP_STOPPED) - return &p->signal->group_exit_code; - } - return NULL; -} - -/** - * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED - * @wo: wait options - * @ptrace: is the wait for ptrace - * @p: task to wait for - * - * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. 
- * - * CONTEXT: - * read_lock(&tasklist_lock), which is released if return value is - * non-zero. Also, grabs and releases @p->sighand->siglock. - * - * RETURNS: - * 0 if wait condition didn't exist and search for other wait conditions - * should continue. Non-zero return, -errno on failure and @p's pid on - * success, implies that tasklist_lock is released and wait condition - * search should terminate. - */ -static int wait_task_stopped(struct wait_opts *wo, - int ptrace, struct task_struct *p) -{ - struct siginfo __user *infop; - int retval, exit_code, *p_code, why; - uid_t uid = 0; /* unneeded, required by compiler */ - pid_t pid; - - /* - * Traditionally we see ptrace'd stopped tasks regardless of options. - */ - if (!ptrace && !(wo->wo_flags & WUNTRACED)) - return 0; - - if (!task_stopped_code(p, ptrace)) - return 0; - - exit_code = 0; - spin_lock_irq(&p->sighand->siglock); - - p_code = task_stopped_code(p, ptrace); - if (unlikely(!p_code)) - goto unlock_sig; - - exit_code = *p_code; - if (!exit_code) - goto unlock_sig; - - if (!unlikely(wo->wo_flags & WNOWAIT)) - *p_code = 0; - - uid = task_uid(p); -unlock_sig: - spin_unlock_irq(&p->sighand->siglock); - if (!exit_code) - return 0; - - /* - * Now we are pretty sure this task is interesting. - * Make sure it doesn't get reaped out from under us while we - * give up the lock and then examine it below. We don't want to - * keep holding onto the tasklist_lock while we call getrusage and - * possibly take page faults for user memory. - */ - get_task_struct(p); - pid = task_pid_vnr(p); - why = ptrace ? CLD_TRAPPED : CLD_STOPPED; - read_unlock(&tasklist_lock); - - if (unlikely(wo->wo_flags & WNOWAIT)) - return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - - retval = wo->wo_rusage - ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; - if (!retval && wo->wo_stat) - retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); - - infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; - put_task_struct(p); - - BUG_ON(!retval); - return retval; -} - -/* - * Handle do_wait work for one task in a live, non-stopped state. - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. - */ -static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) -{ - int retval; - pid_t pid; - uid_t uid; - - if (!unlikely(wo->wo_flags & WCONTINUED)) - return 0; - - if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) - return 0; - - spin_lock_irq(&p->sighand->siglock); - /* Re-check with the lock held. */ - if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { - spin_unlock_irq(&p->sighand->siglock); - return 0; - } - if (!unlikely(wo->wo_flags & WNOWAIT)) - p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = task_uid(p); - spin_unlock_irq(&p->sighand->siglock); - - pid = task_pid_vnr(p); - get_task_struct(p); - read_unlock(&tasklist_lock); - - if (!wo->wo_info) { - retval = wo->wo_rusage - ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; - put_task_struct(p); - if (!retval && wo->wo_stat) - retval = put_user(0xffff, wo->wo_stat); - if (!retval) - retval = pid; - } else { - retval = wait_noreap_copyout(wo, p, pid, uid, - CLD_CONTINUED, SIGCONT); - BUG_ON(retval == 0); - } - - return retval; -} - -/* - * Consider @p for a wait by @parent. - * - * -ECHILD should be in ->notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. - * Returns zero if the search for a child should continue; - * then ->notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. - */ -static int wait_consider_task(struct wait_opts *wo, int ptrace, - struct task_struct *p) -{ - int ret = eligible_child(wo, p); - if (!ret) - return ret; - - ret = security_task_wait(p); - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (wo->notask_error) - wo->notask_error = ret; - return 0; - } - - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { - /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). - */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) - wo->notask_error = 0; - return 0; - } - - /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { - /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. 
- */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } - - /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); - - /* - * Allow access to stopped/continued state via zombie by - * falling through. Clearing of notask_error is complex. - * - * When !@ptrace: - * - * If WEXITED is set, notask_error should naturally be - * cleared. If not, subset of WSTOPPED|WCONTINUED is set, - * so, if there are live subthreads, there are events to - * wait for. If all subthreads are dead, it's still safe - * to clear - this function will be called again in finite - * amount time once all the subthreads are released and - * will then return without clearing. - * - * When @ptrace: - * - * Stopped state is per-task and thus can't change once the - * target task dies. Only continued and exited can happen. - * Clear notask_error if WCONTINUED | WEXITED. - */ - if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) - wo->notask_error = 0; - } else { - /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - - /* - * @p is alive and it's gonna stop, continue or exit, so - * there always is something to wait for. - */ - wo->notask_error = 0; - } - - /* - * Wait for stopped. Depending on @ptrace, different stopped state - * is used and the two don't interact with each other. - */ - ret = wait_task_stopped(wo, ptrace, p); - if (ret) - return ret; - - /* - * Wait for continued. 
There's only one continued state and the - * ptracer can consume it which can confuse the real parent. Don't - * use WCONTINUED from ptracer. You don't need or want it. - */ - return wait_task_continued(wo, p); -} - -/* - * Do the work of do_wait() for one thread in the group, @tsk. - * - * -ECHILD should be in ->notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. - * Returns zero if the search for a child should continue; then - * ->notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. - */ -static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) -{ - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - int ret = wait_consider_task(wo, 0, p); - if (ret) - return ret; - } - - return 0; -} - -static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) -{ - struct task_struct *p; - - list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, 1, p); - if (ret) - return ret; - } - - return 0; -} - -static int child_wait_callback(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct wait_opts *wo = container_of(wait, struct wait_opts, - child_wait); - struct task_struct *p = key; - - if (!eligible_pid(wo, p)) - return 0; - - if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) - return 0; - - return default_wake_function(wait, mode, sync, key); -} - -void __wake_up_parent(struct task_struct *p, struct task_struct *parent) -{ - __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); -} - -static long do_wait(struct wait_opts *wo) -{ - struct task_struct *tsk; - int retval; - - trace_sched_process_wait(wo->wo_pid); - - init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); - wo->child_wait.private = current; - add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); -repeat: - /* - * If there is 
nothing that can match our critiera just get out. - * We will clear ->notask_error to zero if we see any child that - * might later match our criteria, even if we are not able to reap - * it yet. - */ - wo->notask_error = -ECHILD; - if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) - goto notask; - - set_current_state(TASK_INTERRUPTIBLE); - read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); - if (retval) - goto end; - - retval = ptrace_do_wait(wo, tsk); - if (retval) - goto end; - - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); - read_unlock(&tasklist_lock); - -notask: - retval = wo->notask_error; - if (!retval && !(wo->wo_flags & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } -end: - __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); - return retval; -} - -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) -{ - struct wait_opts wo; - struct pid *pid = NULL; - enum pid_type type; - long ret; - - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) - return -EINVAL; - if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) - return -EINVAL; - - switch (which) { - case P_ALL: - type = PIDTYPE_MAX; - break; - case P_PID: - type = PIDTYPE_PID; - if (upid <= 0) - return -EINVAL; - break; - case P_PGID: - type = PIDTYPE_PGID; - if (upid <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); - - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options; - wo.wo_info = infop; - wo.wo_stat = NULL; - wo.wo_rusage = ru; - ret = do_wait(&wo); - - if (ret > 0) { - ret = 0; - } else if (infop) { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * 
difference. - */ - if (!ret) - ret = put_user(0, &infop->si_signo); - if (!ret) - ret = put_user(0, &infop->si_errno); - if (!ret) - ret = put_user(0, &infop->si_code); - if (!ret) - ret = put_user(0, &infop->si_pid); - if (!ret) - ret = put_user(0, &infop->si_uid); - if (!ret) - ret = put_user(0, &infop->si_status); - } - - put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); - return ret; -} - -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) -{ - struct wait_opts wo; - struct pid *pid = NULL; - enum pid_type type; - long ret; - - if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| - __WNOTHREAD|__WCLONE|__WALL)) - return -EINVAL; - - if (upid == -1) - type = PIDTYPE_MAX; - else if (upid < 0) { - type = PIDTYPE_PGID; - pid = find_get_pid(-upid); - } else if (upid == 0) { - type = PIDTYPE_PGID; - pid = get_task_pid(current, PIDTYPE_PGID); - } else /* upid > 0 */ { - type = PIDTYPE_PID; - pid = find_get_pid(upid); - } - - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options | WEXITED; - wo.wo_info = NULL; - wo.wo_stat = stat_addr; - wo.wo_rusage = ru; - ret = do_wait(&wo); - put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); - return ret; -} - -#ifdef __ARCH_WANT_SYS_WAITPID - -/* - * sys_waitpid() remains for compatibility. waitpid() should be - * implemented by calling sys_wait4() from libc.a. - */ -SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) -{ - return sys_wait4(pid, stat_addr, options, NULL); -} - -#endif -/* Rewritten by Rusty Russell, on the backs of many others... - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 
- - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include - -#include -#include - -/* - * mutex protecting text section modification (dynamic code patching). - * some users need to sleep (allocating memory...) while they hold this lock. - * - * NOT exported to modules - patching kernel text is a really delicate matter. - */ -DEFINE_MUTEX(text_mutex); - -extern struct exception_table_entry __start___ex_table[]; -extern struct exception_table_entry __stop___ex_table[]; - -/* Sort the kernel's built-in exception table */ -void __init sort_main_extable(void) -{ - sort_extable(__start___ex_table, __stop___ex_table); -} - -/* Given an address, look for it in the exception tables. 
*/ -const struct exception_table_entry *search_exception_tables(unsigned long addr) -{ - const struct exception_table_entry *e; - - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); - if (!e) - e = search_module_extables(addr); - return e; -} - -static inline int init_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -int core_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && - addr <= (unsigned long)_etext) - return 1; - - if (system_state == SYSTEM_BOOTING && - init_kernel_text(addr)) - return 1; - return 0; -} - -/** - * core_kernel_data - tell if addr points to kernel data - * @addr: address to test - * - * Returns true if @addr passed in is from the core kernel data - * section. - * - * Note: On some archs it may return true for core RODATA, and false - * for others. But will always be true for core RW data. - */ -int core_kernel_data(unsigned long addr) -{ - if (addr >= (unsigned long)_sdata && - addr < (unsigned long)_edata) - return 1; - return 0; -} - -int __kernel_text_address(unsigned long addr) -{ - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - /* - * There might be init symbols in saved stacktraces. - * Give those symbols a chance to be printed in - * backtraces (such as lockdep traces). - * - * Since we are after the module-symbols check, there's - * no danger of address overlap: - */ - if (init_kernel_text(addr)) - return 1; - return 0; -} - -int kernel_text_address(unsigned long addr) -{ - if (core_kernel_text(addr)) - return 1; - return is_module_text_address(addr); -} - -/* - * On some architectures (PPC64, IA64) function pointers - * are actually only tokens to some data that then holds the - * real function address. As a result, to find if a function - * pointer is part of the kernel text, we need to do some - * special dereferencing first. 
- */ -int func_ptr_is_kernel_text(void *ptr) -{ - unsigned long addr; - addr = (unsigned long) dereference_function_descriptor(ptr); - if (core_kernel_text(addr)) - return 1; - return is_module_text_address(addr); -} -/* - * linux/kernel/fork.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * 'fork.c' contains the help-routines for the 'fork' system call - * (see also entry.S and others). - * Fork is rather simple, once you get the hang of it, but the memory - * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#define CREATE_TRACE_POINTS -#include - -/* - * Protected counters by write_lock_irq(&tasklist_lock) - */ -unsigned long total_forks; /* Handle normal Linux uptimes. */ -int nr_threads; /* The idle threads do not count.. 
*/ - -int max_threads; /* tunable limit on nr_threads */ - -DEFINE_PER_CPU(unsigned long, process_counts) = 0; - -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -#ifdef CONFIG_PROVE_RCU -int lockdep_tasklist_lock_is_held(void) -{ - return lockdep_is_held(&tasklist_lock); -} -EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); -#endif /* #ifdef CONFIG_PROVE_RCU */ - -int nr_processes(void) -{ - int cpu; - int total = 0; - - for_each_possible_cpu(cpu) - total += per_cpu(process_counts, cpu); - - return total; -} - -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct_node(node) \ - kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) -# define free_task_struct(tsk) \ - kmem_cache_free(task_struct_cachep, (tsk)) -static struct kmem_cache *task_struct_cachep; -#endif - -#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, - int node) -{ -#ifdef CONFIG_DEBUG_STACK_USAGE - gfp_t mask = GFP_KERNEL | __GFP_ZERO; -#else - gfp_t mask = GFP_KERNEL; -#endif - struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); - - return page ? 
page_address(page) : NULL; -} - -static inline void free_thread_info(struct thread_info *ti) -{ - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); -} -#endif - -/* SLAB cache for signal_struct structures (tsk->signal) */ -static struct kmem_cache *signal_cachep; - -/* SLAB cache for sighand_struct structures (tsk->sighand) */ -struct kmem_cache *sighand_cachep; - -/* SLAB cache for files_struct structures (tsk->files) */ -struct kmem_cache *files_cachep; - -/* SLAB cache for fs_struct structures (tsk->fs) */ -struct kmem_cache *fs_cachep; - -/* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; - -/* SLAB cache for mm_struct structures (tsk->mm) */ -static struct kmem_cache *mm_cachep; - -static void account_kernel_stack(struct thread_info *ti, int account) -{ - struct zone *zone = page_zone(virt_to_page(ti)); - - mod_zone_page_state(zone, NR_KERNEL_STACK, account); -} - -void free_task(struct task_struct *tsk) -{ - account_kernel_stack(tsk->stack, -1); - free_thread_info(tsk->stack); - rt_mutex_debug_task_free(tsk); - ftrace_graph_exit_task(tsk); - free_task_struct(tsk); -} -EXPORT_SYMBOL(free_task); - -static inline void free_signal_struct(struct signal_struct *sig) -{ - taskstats_tgid_free(sig); - sched_autogroup_exit(sig); - kmem_cache_free(signal_cachep, sig); -} - -static inline void put_signal_struct(struct signal_struct *sig) -{ - if (atomic_dec_and_test(&sig->sigcnt)) - free_signal_struct(sig); -} - -void __put_task_struct(struct task_struct *tsk) -{ - WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); - WARN_ON(tsk == current); - - exit_creds(tsk); - delayacct_tsk_free(tsk); - put_signal_struct(tsk->signal); - - if (!profile_handoff_task(tsk)) - free_task(tsk); -} -EXPORT_SYMBOL_GPL(__put_task_struct); - -/* - * macro override instead of weak attribute alias, to workaround - * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. 
- */ -#ifndef arch_task_cache_init -#define arch_task_cache_init() -#endif - -void __init fork_init(unsigned long mempages) -{ -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -#ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES -#endif - /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); -#endif - - /* do the arch specific task caches init */ - arch_task_cache_init(); - - /* - * The default maximum number of threads is set to a safe - * value: the thread structures can take up at most half - * of memory. - */ - max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); - - /* - * we need to allow at least 20 threads to boot a system - */ - if (max_threads < 20) - max_threads = 20; - - init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; - init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; - init_task.signal->rlim[RLIMIT_SIGPENDING] = - init_task.signal->rlim[RLIMIT_NPROC]; -} - -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, - struct task_struct *src) -{ - *dst = *src; - return 0; -} - -static struct task_struct *dup_task_struct(struct task_struct *orig) -{ - struct task_struct *tsk; - struct thread_info *ti; - unsigned long *stackend; - int node = tsk_fork_get_node(orig); - int err; - - prepare_to_copy(orig); - - tsk = alloc_task_struct_node(node); - if (!tsk) - return NULL; - - ti = alloc_thread_info_node(tsk, node); - if (!ti) { - free_task_struct(tsk); - return NULL; - } - - err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; - - tsk->stack = ti; - - setup_thread_stack(tsk, orig); - clear_user_return_notifier(tsk); - clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ - -#ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); -#endif - - /* - * One for us, one for 
whoever does the "release_task()" (usually - * parent) - */ - atomic_set(&tsk->usage, 2); -#ifdef CONFIG_BLK_DEV_IO_TRACE - tsk->btrace_seq = 0; -#endif - tsk->splice_pipe = NULL; - - account_kernel_stack(ti, 1); - - return tsk; - -out: - free_thread_info(ti); - free_task_struct(tsk); - return NULL; -} - -#ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - struct mempolicy *pol; - - down_write(&oldmm->mmap_sem); - flush_cache_dup_mm(oldmm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - mm->locked_vm = 0; - mm->mmap = NULL; - mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - long pages = vma_pages(mpnt); - mm->total_vm -= pages; - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -pages); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) - goto fail_nomem_policy; - vma_set_policy(tmp, pol); - tmp->vm_mm = mm; - if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= 
~VM_LOCKED; - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file->f_path.dentry->d_inode; - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); - if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); - flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(pol); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -#define dup_mmap(mm, oldmm) (0) -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - 
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); - -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - -static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; - -static int __init coredump_filter_setup(char *s) -{ - default_dump_filter = - (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & - MMF_DUMP_FILTER_MASK; - return 1; -} - -__setup("coredump_filter=", coredump_filter_setup); - -#include - -static void mm_init_aio(struct mm_struct *mm) -{ -#ifdef CONFIG_AIO - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); -#endif -} - -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) -{ - atomic_set(&mm->mm_users, 1); - atomic_set(&mm->mm_count, 1); - init_rwsem(&mm->mmap_sem); - INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? - (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; - mm->core_state = NULL; - mm->nr_ptes = 0; - memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); - spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - mm_init_aio(mm); - mm_init_owner(mm, p); - - if (likely(!mm_alloc_pgd(mm))) { - mm->def_flags = 0; - mmu_notifier_mm_init(mm); - return mm; - } - - free_mm(mm); - return NULL; -} - -/* - * Allocate and initialize an mm_struct. - */ -struct mm_struct *mm_alloc(void) -{ - struct mm_struct *mm; - - mm = allocate_mm(); - if (!mm) - return NULL; - - memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); - return mm_init(mm, current); -} - -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. 
- */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(mm->pmd_huge_pte); -#endif - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - -/* - * Decrement the use count and release all resources for an mm. - */ -void mmput(struct mm_struct *mm) -{ - might_sleep(); - - if (atomic_dec_and_test(&mm->mm_users)) { - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - put_swap_token(mm); - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); - } -} -EXPORT_SYMBOL_GPL(mmput); - -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) -{ - if (new_exe_file) - get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; -} - -struct file *get_mm_exe_file(struct mm_struct *mm) -{ - struct file *exe_file; - - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); - return exe_file; -} - -static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock 
because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} - -/** - * get_task_mm - acquire a reference to the task's mm - * - * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning - * this kernel workthread has transiently adopted a user mm with use_mm, - * to do its AIO) is not set and if so returns a reference to it, after - * bumping up the use count. User must release the mm via mmput() - * after use. Typically used by /proc and ptrace. - */ -struct mm_struct *get_task_mm(struct task_struct *task) -{ - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) { - if (task->flags & PF_KTHREAD) - mm = NULL; - else - atomic_inc(&mm->mm_users); - } - task_unlock(task); - return mm; -} -EXPORT_SYMBOL_GPL(get_task_mm); - -struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - -static void complete_vfork_done(struct task_struct *tsk) -{ - struct completion *vfork; - - task_lock(tsk); - vfork = tsk->vfork_done; - if (likely(vfork)) { - tsk->vfork_done = NULL; - complete(vfork); - } - task_unlock(tsk); -} - -static int wait_for_vfork_done(struct task_struct *child, - struct completion *vfork) -{ - int killed; - - freezer_do_not_count(); - killed = wait_for_completion_killable(vfork); - freezer_count(); - - if (killed) { - task_lock(child); - child->vfork_done = NULL; - task_unlock(child); - } - - put_task_struct(child); - return killed; -} - -/* Please note the differences between mmput and mm_release. - * mmput is called whenever we stop holding onto a mm_struct, - * error success whatever. 
- * - * mm_release is called after a mm_struct has been removed - * from the current process. - * - * This difference is important for error handling, when we - * only half set up a mm_struct for a new process and need to restore - * the old one. Because we mmput the new mm_struct before - * restoring the old one. . . - * Eric Biederman 10 January 1998 - */ -void mm_release(struct task_struct *tsk, struct mm_struct *mm) -{ - /* Get rid of any futexes when releasing the mm */ -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } -#endif - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); -#endif - - /* Get rid of any cached register state */ - deactivate_mm(tsk, mm); - - if (tsk->vfork_done) - complete_vfork_done(tsk); - - /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. - */ - if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && - atomic_read(&mm->mm_users) > 1) { - /* - * We don't check the error code - if userspace has - * not set up a proper pointer then tough luck. - */ - put_user(0, tsk->clear_child_tid); - sys_futex(tsk->clear_child_tid, FUTEX_WAKE, - 1, NULL, NULL, 0); - } - tsk->clear_child_tid = NULL; - } -} - -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. 
- */ -struct mm_struct *dup_mm(struct task_struct *tsk) -{ - struct mm_struct *mm, *oldmm = current->mm; - int err; - - if (!oldmm) - return NULL; - - mm = allocate_mm(); - if (!mm) - goto fail_nomem; - - memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); - - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - mm->pmd_huge_pte = NULL; -#endif - - if (!mm_init(mm, tsk)) - goto fail_nomem; - - if (init_new_context(tsk, mm)) - goto fail_nocontext; - - dup_mm_exe_file(oldmm, mm); - - err = dup_mmap(mm, oldmm); - if (err) - goto free_pt; - - mm->hiwater_rss = get_mm_rss(mm); - mm->hiwater_vm = mm->total_vm; - - if (mm->binfmt && !try_module_get(mm->binfmt->module)) - goto free_pt; - - return mm; - -free_pt: - /* don't put binfmt in mmput, we haven't got module yet */ - mm->binfmt = NULL; - mmput(mm); - -fail_nomem: - return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; -} - -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) -{ - struct mm_struct *mm, *oldmm; - int retval; - - tsk->min_flt = tsk->maj_flt = 0; - tsk->nvcsw = tsk->nivcsw = 0; -#ifdef CONFIG_DETECT_HUNG_TASK - tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; -#endif - - tsk->mm = NULL; - tsk->active_mm = NULL; - - /* - * Are we cloning a kernel thread? - * - * We need to steal a active VM for that.. 
- */ - oldmm = current->mm; - if (!oldmm) - return 0; - - if (clone_flags & CLONE_VM) { - atomic_inc(&oldmm->mm_users); - mm = oldmm; - goto good_mm; - } - - retval = -ENOMEM; - mm = dup_mm(tsk); - if (!mm) - goto fail_nomem; - -good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - - tsk->mm = mm; - tsk->active_mm = mm; - return 0; - -fail_nomem: - return retval; -} - -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) -{ - struct fs_struct *fs = current->fs; - if (clone_flags & CLONE_FS) { - /* tsk->fs is already what we want */ - spin_lock(&fs->lock); - if (fs->in_exec) { - spin_unlock(&fs->lock); - return -EAGAIN; - } - fs->users++; - spin_unlock(&fs->lock); - return 0; - } - tsk->fs = copy_fs_struct(fs); - if (!tsk->fs) - return -ENOMEM; - return 0; -} - -static int copy_files(unsigned long clone_flags, struct task_struct *tsk) -{ - struct files_struct *oldf, *newf; - int error = 0; - - /* - * A background process may not have any files ... 
- */ - oldf = current->files; - if (!oldf) - goto out; - - if (clone_flags & CLONE_FILES) { - atomic_inc(&oldf->count); - goto out; - } - - newf = dup_fd(oldf, &error); - if (!newf) - goto out; - - tsk->files = newf; - error = 0; -out: - return error; -} - -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) -{ - struct sighand_struct *sig; - - if (clone_flags & CLONE_SIGHAND) { - atomic_inc(¤t->sighand->count); - return 0; - } - sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); - if (!sig) - return -ENOMEM; - atomic_set(&sig->count, 1); - memcpy(sig->action, current->sighand->action, sizeof(sig->action)); - return 0; -} - -void __cleanup_sighand(struct sighand_struct *sighand) -{ - if (atomic_dec_and_test(&sighand->count)) { - signalfd_cleanup(sighand); - kmem_cache_free(sighand_cachep, sighand); - } -} - - -/* - * Initialize POSIX timer handling for a thread group. - */ -static void posix_cpu_timers_init_group(struct signal_struct *sig) -{ - unsigned long cpu_limit; - - /* Thread group counters. */ - thread_group_cputime_init(sig); - - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); - if (cpu_limit != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); - sig->cputimer.running = 1; - } - - /* The timer lists. 
*/ - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); -} - -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) -{ - struct signal_struct *sig; - - if (clone_flags & CLONE_THREAD) - return 0; - - sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); - tsk->signal = sig; - if (!sig) - return -ENOMEM; - - sig->nr_threads = 1; - atomic_set(&sig->live, 1); - atomic_set(&sig->sigcnt, 1); - init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; - sig->curr_target = tsk; - init_sigpending(&sig->shared_pending); - INIT_LIST_HEAD(&sig->posix_timers); - - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; - - task_lock(current->group_leader); - memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); - task_unlock(current->group_leader); - - posix_cpu_timers_init_group(sig); - - tty_audit_fork(sig); - sched_autogroup_fork(sig); - -#ifdef CONFIG_CGROUPS - init_rwsem(&sig->group_rwsem); -#endif - - sig->oom_adj = current->signal->oom_adj; - sig->oom_score_adj = current->signal->oom_score_adj; - sig->oom_score_adj_min = current->signal->oom_score_adj_min; - - mutex_init(&sig->cred_guard_mutex); - - return 0; -} - -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - -SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) -{ - current->clear_child_tid = tidptr; - - return task_pid_vnr(current); -} - -static void rt_mutex_init_task(struct task_struct *p) -{ - raw_spin_lock_init(&p->pi_lock); -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters); - p->pi_blocked_on = NULL; -#endif -} - -#ifdef CONFIG_MM_OWNER -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* 
CONFIG_MM_OWNER */ - -/* - * Initialize POSIX timer handling for a single task. - */ -static void posix_cpu_timers_init(struct task_struct *tsk) -{ - tsk->cputime_expires.prof_exp = 0; - tsk->cputime_expires.virt_exp = 0; - tsk->cputime_expires.sched_exp = 0; - INIT_LIST_HEAD(&tsk->cpu_timers[0]); - INIT_LIST_HEAD(&tsk->cpu_timers[1]); - INIT_LIST_HEAD(&tsk->cpu_timers[2]); -} - -/* - * This creates a new process as a copy of the old one, - * but does not actually start it yet. - * - * It copies the registers, and all the appropriate - * parts of the process environment (as per the clone - * flags). The actual kick-off is left to the caller. - */ -static struct task_struct *copy_process(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *child_tidptr, - struct pid *pid, - int trace) -{ - int retval; - struct task_struct *p; - int cgroup_callbacks_done = 0; - - if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) - return ERR_PTR(-EINVAL); - - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. - */ - if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) - return ERR_PTR(-EINVAL); - - /* - * Shared signal handlers imply shared VM. By way of the above, - * thread groups also imply shared VM. Blocking this case allows - * for various simplifications in other code. - */ - if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) - return ERR_PTR(-EINVAL); - - /* - * Siblings of global init remain as zombies on exit since they are - * not reaped by their parent (swapper). To solve this and to avoid - * multi-rooted process trees, prevent global and container-inits - * from creating siblings. 
- */ - if ((clone_flags & CLONE_PARENT) && - current->signal->flags & SIGNAL_UNKILLABLE) - return ERR_PTR(-EINVAL); - - retval = security_task_create(clone_flags); - if (retval) - goto fork_out; - - retval = -ENOMEM; - p = dup_task_struct(current); - if (!p) - goto fork_out; - - ftrace_graph_init_task(p); - - rt_mutex_init_task(p); - -#ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); - DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); -#endif - retval = -EAGAIN; - if (atomic_read(&p->real_cred->user->processes) >= - task_rlimit(p, RLIMIT_NPROC)) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != INIT_USER) - goto bad_fork_free; - } - current->flags &= ~PF_NPROC_EXCEEDED; - - retval = copy_creds(p, clone_flags); - if (retval < 0) - goto bad_fork_free; - - /* - * If multiple threads are within copy_process(), then this check - * triggers too late. This doesn't hurt, the check is only there - * to stop root fork bombs. - */ - retval = -EAGAIN; - if (nr_threads >= max_threads) - goto bad_fork_cleanup_count; - - if (!try_module_get(task_thread_info(p)->exec_domain->module)) - goto bad_fork_cleanup_count; - - p->did_exec = 0; - delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); - INIT_LIST_HEAD(&p->children); - INIT_LIST_HEAD(&p->sibling); - rcu_copy_process(p); - p->vfork_done = NULL; - spin_lock_init(&p->alloc_lock); - - init_sigpending(&p->pending); - - p->utime = p->stime = p->gtime = 0; - p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; -#endif -#if defined(SPLIT_RSS_COUNTING) - memset(&p->rss_stat, 0, sizeof(p->rss_stat)); -#endif - - p->default_timer_slack_ns = current->timer_slack_ns; - - task_io_accounting_init(&p->ioac); - acct_clear_integrals(p); - - posix_cpu_timers_init(p); - - do_posix_clock_monotonic_gettime(&p->start_time); - p->real_start_time = p->start_time; - 
monotonic_to_bootbased(&p->real_start_time); - p->io_context = NULL; - p->audit_context = NULL; - if (clone_flags & CLONE_THREAD) - threadgroup_change_begin(current); - cgroup_fork(p); -#ifdef CONFIG_NUMA - p->mempolicy = mpol_dup(p->mempolicy); - if (IS_ERR(p->mempolicy)) { - retval = PTR_ERR(p->mempolicy); - p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; - } - mpol_fix_fork_child_flag(p); -#endif -#ifdef CONFIG_CPUSETS - p->cpuset_mem_spread_rotor = NUMA_NO_NODE; - p->cpuset_slab_spread_rotor = NUMA_NO_NODE; -#endif -#ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - p->hardirqs_enabled = 1; -#else - p->hardirqs_enabled = 0; -#endif - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->hardirq_context = 0; - p->softirq_context = 0; -#endif -#ifdef CONFIG_LOCKDEP - p->lockdep_depth = 0; /* no locks held yet */ - p->curr_chain_key = 0; - p->lockdep_recursion = 0; -#endif - -#ifdef CONFIG_DEBUG_MUTEXES - p->blocked_on = NULL; /* not blocked yet */ -#endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif - - /* Perform scheduler related setup. Assign this task to a CPU. 
*/ - sched_fork(p); - - retval = perf_event_init_task(p); - if (retval) - goto bad_fork_cleanup_policy; - retval = audit_alloc(p); - if (retval) - goto bad_fork_cleanup_policy; - /* copy all the process information */ - retval = copy_semundo(clone_flags, p); - if (retval) - goto bad_fork_cleanup_audit; - retval = copy_files(clone_flags, p); - if (retval) - goto bad_fork_cleanup_semundo; - retval = copy_fs(clone_flags, p); - if (retval) - goto bad_fork_cleanup_files; - retval = copy_sighand(clone_flags, p); - if (retval) - goto bad_fork_cleanup_fs; - retval = copy_signal(clone_flags, p); - if (retval) - goto bad_fork_cleanup_sighand; - retval = copy_mm(clone_flags, p); - if (retval) - goto bad_fork_cleanup_signal; - retval = copy_namespaces(clone_flags, p); - if (retval) - goto bad_fork_cleanup_mm; - retval = copy_io(clone_flags, p); - if (retval) - goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); - if (retval) - goto bad_fork_cleanup_io; - - if (pid != &init_struct_pid) { - retval = -ENOMEM; - pid = alloc_pid(p->nsproxy->pid_ns); - if (!pid) - goto bad_fork_cleanup_io; - } - - p->pid = pid_nr(pid); - p->tgid = p->pid; - if (clone_flags & CLONE_THREAD) - p->tgid = current->tgid; - - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; -#ifdef CONFIG_BLOCK - p->plug = NULL; -#endif -#ifdef CONFIG_FUTEX - p->robust_list = NULL; -#ifdef CONFIG_COMPAT - p->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&p->pi_state_list); - p->pi_state_cache = NULL; -#endif - /* - * sigaltstack should be cleared when sharing the same VM - */ - if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; - - /* - * Syscall tracing and stepping should be turned off in the - * child regardless of CLONE_PTRACE. 
- */ - user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); -#endif - clear_all_latency_tracing(p); - - /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); - p->pdeath_signal = 0; - p->exit_state = 0; - - p->nr_dirtied = 0; - p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); - p->dirty_paused_when = 0; - - /* - * Ok, make it visible to the rest of the system. - * We dont wake it up yet. - */ - p->group_leader = p; - INIT_LIST_HEAD(&p->thread_group); - - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - - /* Need tasklist lock for parent etc handling! */ - write_lock_irq(&tasklist_lock); - - /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { - p->real_parent = current->real_parent; - p->parent_exec_id = current->parent_exec_id; - } else { - p->real_parent = current; - p->parent_exec_id = current->self_exec_id; - } - - spin_lock(¤t->sighand->siglock); - - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL). 
- */ - recalc_sigpending(); - if (signal_pending(current)) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; - } - - if (clone_flags & CLONE_THREAD) { - current->signal->nr_threads++; - atomic_inc(¤t->signal->live); - atomic_inc(¤t->signal->sigcnt); - p->group_leader = current->group_leader; - list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - } - - if (likely(p->pid)) { - ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); - - if (thread_group_leader(p)) { - if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; - - p->signal->leader_pid = pid; - p->signal->tty = tty_kref_get(current->signal->tty); - attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); - attach_pid(p, PIDTYPE_SID, task_session(current)); - list_add_tail(&p->sibling, &p->real_parent->children); - list_add_tail_rcu(&p->tasks, &init_task.tasks); - __this_cpu_inc(process_counts); - } - attach_pid(p, PIDTYPE_PID, pid); - nr_threads++; - } - - total_forks++; - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - proc_fork_connector(p); - cgroup_post_fork(p); - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); - perf_event_fork(p); - - trace_task_newtask(p, clone_flags); - - return p; - -bad_fork_free_pid: - if (pid != &init_struct_pid) - free_pid(pid); -bad_fork_cleanup_io: - if (p->io_context) - exit_io_context(p); -bad_fork_cleanup_namespaces: - exit_task_namespaces(p); -bad_fork_cleanup_mm: - if (p->mm) - mmput(p->mm); -bad_fork_cleanup_signal: - if (!(clone_flags & CLONE_THREAD)) - free_signal_struct(p->signal); -bad_fork_cleanup_sighand: - __cleanup_sighand(p->sighand); -bad_fork_cleanup_fs: - exit_fs(p); /* blocking */ -bad_fork_cleanup_files: - exit_files(p); /* blocking */ -bad_fork_cleanup_semundo: - exit_sem(p); -bad_fork_cleanup_audit: - audit_free(p); -bad_fork_cleanup_policy: - perf_event_free_task(p); -#ifdef CONFIG_NUMA - 
mpol_put(p->mempolicy); -bad_fork_cleanup_cgroup: -#endif - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); - cgroup_exit(p, cgroup_callbacks_done); - delayacct_tsk_free(p); - module_put(task_thread_info(p)->exec_domain->module); -bad_fork_cleanup_count: - atomic_dec(&p->cred->user->processes); - exit_creds(p); -bad_fork_free: - free_task(p); -fork_out: - return ERR_PTR(retval); -} - -noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - return regs; -} - -static inline void init_idle_pids(struct pid_link *links) -{ - enum pid_type type; - - for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { - INIT_HLIST_NODE(&links[type].node); /* not really needed */ - links[type].pid = &init_struct_pid; - } -} - -struct task_struct * __cpuinit fork_idle(int cpu) -{ - struct task_struct *task; - struct pt_regs regs; - - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid, 0); - if (!IS_ERR(task)) { - init_idle_pids(task->pids); - init_idle(task, cpu); - } - - return task; -} - -/* - * Ok, this is the main fork-routine. - * - * It copies the process, and if successful kick-starts - * it and waits for it to finish using the VM if required. - */ -long do_fork(unsigned long clone_flags, - unsigned long stack_start, - struct pt_regs *regs, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) -{ - struct task_struct *p; - int trace = 0; - long nr; - - /* - * Do some preliminary argument and permissions checking before we - * actually start allocating stuff - */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) - return -EINVAL; - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; - } - - /* - * Determine whether and which event to report to ptracer. 
When - * called from kernel_thread or CLONE_UNTRACED is explicitly - * requested, no event is reported; otherwise, report if the event - * for the type of forking is enabled. - */ - if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { - if (clone_flags & CLONE_VFORK) - trace = PTRACE_EVENT_VFORK; - else if ((clone_flags & CSIGNAL) != SIGCHLD) - trace = PTRACE_EVENT_CLONE; - else - trace = PTRACE_EVENT_FORK; - - if (likely(!ptrace_event_enabled(current, trace))) - trace = 0; - } - - p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL, trace); - /* - * Do this prior waking up the new thread - the thread pointer - * might get invalid after that point, if the thread exits quickly. - */ - if (!IS_ERR(p)) { - struct completion vfork; - - trace_sched_process_fork(current, p); - - nr = task_pid_vnr(p); - - if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); - - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); - get_task_struct(p); - } - - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ - if (unlikely(trace)) - ptrace_event(trace, nr); - - if (clone_flags & CLONE_VFORK) { - if (!wait_for_vfork_done(p, &vfork)) - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); - } - } else { - nr = PTR_ERR(p); - } - return nr; -} - -#ifndef ARCH_MIN_MMSTRUCT_ALIGN -#define ARCH_MIN_MMSTRUCT_ALIGN 0 -#endif - -static void sighand_ctor(void *data) -{ - struct sighand_struct *sighand = data; - - spin_lock_init(&sighand->siglock); - init_waitqueue_head(&sighand->signalfd_wqh); -} - -void __init proc_caches_init(void) -{ - sighand_cachep = kmem_cache_create("sighand_cache", - sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); - signal_cachep = kmem_cache_create("signal_cache", - sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - files_cachep = 
kmem_cache_create("files_cache", - sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - fs_cachep = kmem_cache_create("fs_cache", - sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - /* - * FIXME! The "sizeof(struct mm_struct)" currently includes the - * whole struct cpumask for the OFFSTACK case. We could change - * this to *only* allocate as much of it as required by the - * maximum number of CPU's we can ever have. The cpumask_allocation - * is at the end of the structure, exactly for that reason. - */ - mm_cachep = kmem_cache_create("mm_struct", - sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); - mmap_init(); - nsproxy_cache_init(); -} - -/* - * Check constraints on flags passed to the unshare system call. - */ -static int check_unshare_flags(unsigned long unshare_flags) -{ - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) - return -EINVAL; - /* - * Not implemented, but pretend it works if there is nothing to - * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND - * needs to unshare vm. 
- */ - if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { - /* FIXME: get_task_mm() increments ->mm_users */ - if (atomic_read(¤t->mm->mm_users) > 1) - return -EINVAL; - } - - return 0; -} - -/* - * Unshare the filesystem structure if it is being shared - */ -static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) -{ - struct fs_struct *fs = current->fs; - - if (!(unshare_flags & CLONE_FS) || !fs) - return 0; - - /* don't need lock here; in the worst case we'll do useless copy */ - if (fs->users == 1) - return 0; - - *new_fsp = copy_fs_struct(fs); - if (!*new_fsp) - return -ENOMEM; - - return 0; -} - -/* - * Unshare file descriptor table if it is being shared - */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) -{ - struct files_struct *fd = current->files; - int error = 0; - - if ((unshare_flags & CLONE_FILES) && - (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); - if (!*new_fdp) - return error; - } - - return 0; -} - -/* - * unshare allows a process to 'unshare' part of the process - * context which was originally shared using clone. copy_* - * functions used by do_fork() cannot be used here directly - * because they modify an inactive task_struct that is being - * constructed. Here we are modifying the current, active, - * task_struct. - */ -SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) -{ - struct fs_struct *fs, *new_fs = NULL; - struct files_struct *fd, *new_fd = NULL; - struct nsproxy *new_nsproxy = NULL; - int do_sysvsem = 0; - int err; - - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - - /* - * If unsharing namespace, must also unshare filesystem information. - */ - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - /* - * CLONE_NEWIPC must also detach from the undolist: after switching - * to a new ipc namespace, the semaphore arrays from the old - * namespace are unreachable. 
- */ - if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) - do_sysvsem = 1; - err = unshare_fs(unshare_flags, &new_fs); - if (err) - goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); - if (err) - goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); - if (err) - goto bad_unshare_cleanup_fd; - - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { - if (do_sysvsem) { - /* - * CLONE_SYSVSEM is equivalent to sys_exit(). - */ - exit_sem(current); - } - - if (new_nsproxy) { - switch_task_namespaces(current, new_nsproxy); - new_nsproxy = NULL; - } - - task_lock(current); - - if (new_fs) { - fs = current->fs; - spin_lock(&fs->lock); - current->fs = new_fs; - if (--fs->users) - new_fs = NULL; - else - new_fs = fs; - spin_unlock(&fs->lock); - } - - if (new_fd) { - fd = current->files; - current->files = new_fd; - new_fd = fd; - } - - task_unlock(current); - } - - if (new_nsproxy) - put_nsproxy(new_nsproxy); - -bad_unshare_cleanup_fd: - if (new_fd) - put_files_struct(new_fd); - -bad_unshare_cleanup_fs: - if (new_fs) - free_fs_struct(new_fs); - -bad_unshare_out: - return err; -} - -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. 
- */ - -int unshare_files(struct files_struct **displaced) -{ - struct task_struct *task = current; - struct files_struct *copy = NULL; - int error; - - error = unshare_fd(CLONE_FILES, ©); - if (error || !copy) { - *displaced = NULL; - return error; - } - *displaced = task->files; - task_lock(task); - task->files = copy; - task_unlock(task); - return 0; -} -/* - * kernel/freezer.c - Function to freeze a process - * - * Originally from kernel/power/process.c - */ - -#include -#include -#include -#include -#include -#include - -/* total number of freezing conditions in effect */ -atomic_t system_freezing_cnt = ATOMIC_INIT(0); -EXPORT_SYMBOL(system_freezing_cnt); - -/* indicate whether PM freezing is in effect, protected by pm_mutex */ -bool pm_freezing; -bool pm_nosig_freezing; - -/* protects freezing and frozen transitions */ -static DEFINE_SPINLOCK(freezer_lock); - -/** - * freezing_slow_path - slow path for testing whether a task needs to be frozen - * @p: task to be tested - * - * This function is called by freezing() if system_freezing_cnt isn't zero - * and tests whether @p needs to enter and stay in frozen state. Can be - * called under any context. The freezers are responsible for ensuring the - * target tasks see the updated state. - */ -bool freezing_slow_path(struct task_struct *p) -{ - if (p->flags & PF_NOFREEZE) - return false; - - if (pm_nosig_freezing || cgroup_freezing(p)) - return true; - - if (pm_freezing && !(p->flags & PF_KTHREAD)) - return true; - - return false; -} -EXPORT_SYMBOL(freezing_slow_path); - -/* Refrigerator is place where frozen processes are stored :-). */ -bool __refrigerator(bool check_kthr_stop) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? 
*/ - bool was_frozen = false; - long save = current->state; - - pr_debug("%s entered refrigerator\n", current->comm); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - - spin_lock_irq(&freezer_lock); - current->flags |= PF_FROZEN; - if (!freezing(current) || - (check_kthr_stop && kthread_should_stop())) - current->flags &= ~PF_FROZEN; - spin_unlock_irq(&freezer_lock); - - if (!(current->flags & PF_FROZEN)) - break; - was_frozen = true; - schedule(); - } - - pr_debug("%s left refrigerator\n", current->comm); - - /* - * Restore saved task state before returning. The mb'd version - * needs to be used; otherwise, it might silently break - * synchronization which depends on ordered task state change. - */ - set_current_state(save); - - return was_frozen; -} -EXPORT_SYMBOL(__refrigerator); - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - if (lock_task_sighand(p, &flags)) { - signal_wake_up(p, 0); - unlock_task_sighand(p, &flags); - } -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. - * - * RETURNS: - * %false, if @p is not freezing or already frozen; %true, otherwise - */ -bool freeze_task(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (!freezing(p) || frozen(p)) { - spin_unlock_irqrestore(&freezer_lock, flags); - return false; - } - - if (!(p->flags & PF_KTHREAD)) { - fake_signal_wake_up(p); - /* - * fake_signal_wake_up() goes through p's scheduler - * lock and guarantees that TASK_STOPPED/TRACED -> - * TASK_RUNNING transition can't race with task state - * testing in try_to_freeze_tasks(). 
- */ - } else { - wake_up_state(p, TASK_INTERRUPTIBLE); - } - - spin_unlock_irqrestore(&freezer_lock, flags); - return true; -} - -void __thaw_task(struct task_struct *p) -{ - unsigned long flags; - - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ - spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) - wake_up_process(p); - spin_unlock_irqrestore(&freezer_lock, flags); -} - -/** - * set_freezable - make %current freezable - * - * Mark %current freezable and enter refrigerator if necessary. - */ -bool set_freezable(void) -{ - might_sleep(); - - /* - * Modify flags while holding freezer_lock. This ensures the - * freezer notices that we aren't frozen yet or the freezing - * condition is visible to try_to_freeze() below. - */ - spin_lock_irq(&freezer_lock); - current->flags &= ~PF_NOFREEZE; - spin_unlock_irq(&freezer_lock); - - return try_to_freeze(); -} -EXPORT_SYMBOL(set_freezable); -/* - * Fast Userspace Mutexes (which I call "Futexes!"). - * (C) Rusty Russell, IBM 2002 - * - * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar - * (C) Copyright 2003 Red Hat Inc, All Rights Reserved - * - * Removed page pinning, fix privately mapped COW pages and other cleanups - * (C) Copyright 2003, 2004 Jamie Lokier - * - * Robust futex support started by Ingo Molnar - * (C) Copyright 2006 Red Hat Inc, All Rights Reserved - * Thanks to Thomas Gleixner for suggestions, analysis and fixes. - * - * PI-futex support started by Ingo Molnar and Thomas Gleixner - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner - * - * PRIVATE futexes by Eric Dumazet - * Copyright (C) 2007 Eric Dumazet - * - * Requeue-PI support by Darren Hart - * Copyright (C) IBM Corporation, 2009 - * Thanks to Thomas Gleixner for conceptual design and careful reviews. 
- * - * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly - * enough at me, Linus for the original (flawed) idea, Matthew - * Kirkwood for proof-of-concept implementation. - * - * "The futexes are also cursed." - * "But they come in a choice of three flavours!" - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "rtmutex_common.h" - -int __read_mostly futex_cmpxchg_enabled; - -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) - -/* - * Futex flags used to encode options to functions and preserve them across - * restarts. 
- */ -#define FLAGS_SHARED 0x01 -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - -/* - * Priority Inheritance state: - */ -struct futex_pi_state { - /* - * list of 'owned' pi_state instances - these have to be - * cleaned up in do_exit() if the task exits prematurely: - */ - struct list_head list; - - /* - * The PI object: - */ - struct rt_mutex pi_mutex; - - struct task_struct *owner; - atomic_t refcount; - - union futex_key key; -}; - -/** - * struct futex_q - The hashed futex queue entry, one per waiting task - * @list: priority-sorted list of tasks waiting on this futex - * @task: the task waiting on the futex - * @lock_ptr: the hash bucket lock - * @key: the key the futex is hashed on - * @pi_state: optional priority inheritance state - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup - * - * We use this hashed waitqueue, instead of a normal wait_queue_t, so - * we can wake only the relevant ones (hashed queues may be shared). - * - * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakeup is always to make the first condition true, then - * the second. - * - * PI futexes are typically woken before they are removed from the hash list via - * the rt_mutex code. See unqueue_me_pi(). - */ -struct futex_q { - struct plist_node list; - - struct task_struct *task; - spinlock_t *lock_ptr; - union futex_key key; - struct futex_pi_state *pi_state; - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; -}; - -static const struct futex_q futex_q_init = { - /* list gets initialized in queue_me()*/ - .key = FUTEX_KEY_INIT, - .bitset = FUTEX_BITSET_MATCH_ANY -}; - -/* - * Hash buckets are shared by all the futex_keys that hash to the same - * location. 
Each key may have multiple futex_q structures, one for each task - * waiting on a futex. - */ -struct futex_hash_bucket { - spinlock_t lock; - struct plist_head chain; -}; - -static struct futex_hash_bucket futex_queues[1<both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, - key->both.offset); - return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; -} - -/* - * Return 1 if two futex_keys are equal, 0 otherwise. - */ -static inline int match_futex(union futex_key *key1, union futex_key *key2) -{ - return (key1 && key2 - && key1->both.word == key2->both.word - && key1->both.ptr == key2->both.ptr - && key1->both.offset == key2->both.offset); -} - -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) - return; - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - ihold(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); - break; - } -} - -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. - */ -static void drop_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) { - /* If we're here then we tried to put a key we failed to get */ - WARN_ON_ONCE(1); - return; - } - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } -} - -/** - * get_futex_key() - Get parameters which are the keys for a futex - * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED - * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) - * - * Returns a negative error code or 0 - * The key words are stored in *key on success. 
- * - * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. - * - * lock_page() might sleep, the caller should not hold a spinlock. - */ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) -{ - unsigned long address = (unsigned long)uaddr; - struct mm_struct *mm = current->mm; - struct page *page, *page_head; - int err, ro = 0; - - /* - * The futex address must be "naturally" aligned. - */ - key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) - return -EINVAL; - address -= key->both.offset; - - /* - * PROCESS_PRIVATE futexes are fast. - * As the mm cannot disappear under us and the 'key' only needs - * virtual address, we dont even have to find the underlying vma. - * Note : We do have to check 'uaddr' is a valid user address, - * but access_ok() should be faster than find_vma() - */ - if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; - key->private.mm = mm; - key->private.address = address; - get_futex_key_refs(key); - return 0; - } - -again: - err = get_user_pages_fast(address, 1, 1, &page); - /* - * If write access is not required (eg. FUTEX_WAIT), try - * and get read-only access. - */ - if (err == -EFAULT && rw == VERIFY_READ) { - err = get_user_pages_fast(address, 1, 0, &page); - ro = 1; - } - if (err < 0) - return err; - else - err = 0; - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. 
The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. - */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); - - /* - * If page_head->mapping is NULL, then it cannot be a PageAnon - * page; but it might be the ZERO_PAGE or in the gate area or - * in a special mapping (all cases which we are happy to fail); - * or it may have been a good file page when get_user_pages_fast - * found it, but truncated or holepunched or subjected to - * invalidate_complete_page2 before we got the page lock (also - * cases which we are happy to fail). And we hold a reference, - * so refcount care in invalidate_complete_page's remove_mapping - * prevents drop_caches from setting mapping to NULL beneath us. - * - * The case we do have to guard against is when memory pressure made - * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. - */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); - if (shmem_swizzled) - goto again; - return -EFAULT; - } - - /* - * Private mappings are handled in a simple way. - * - * NOTE: When userspace waits on a MAP_SHARED mapping, even if - * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. - */ - if (PageAnon(page_head)) { - /* - * A RO anonymous page will never change and thus doesn't make - * sense for futex operations. 
- */ - if (ro) { - err = -EFAULT; - goto out; - } - - key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ - key->private.mm = mm; - key->private.address = address; - } else { - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; - key->shared.pgoff = page_head->index; - } - - get_futex_key_refs(key); - -out: - unlock_page(page_head); - put_page(page_head); - return err; -} - -static inline void put_futex_key(union futex_key *key) -{ - drop_futex_key_refs(key); -} - -/** - * fault_in_user_writeable() - Fault in user address and verify RW access - * @uaddr: pointer to faulting user space address - * - * Slow path to fixup the fault we just took in the atomic write - * access to @uaddr. - * - * We have no generic implementation of a non-destructive write to the - * user address. We know that we faulted in the atomic pagefault - * disabled section so we can as well avoid the #PF overhead by - * calling get_user_pages() right away. - */ -static int fault_in_user_writeable(u32 __user *uaddr) -{ - struct mm_struct *mm = current->mm; - int ret; - - down_read(&mm->mmap_sem); - ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); - up_read(&mm->mmap_sem); - - return ret < 0 ? ret : 0; -} - -/** - * futex_top_waiter() - Return the highest priority waiter on a futex - * @hb: the hash bucket the futex_q's reside in - * @key: the futex key (to distinguish it from other futex futex_q's) - * - * Must be called with the hb lock held. 
- */ -static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, - union futex_key *key) -{ - struct futex_q *this; - - plist_for_each_entry(this, &hb->chain, list) { - if (match_futex(&this->key, key)) - return this; - } - return NULL; -} - -static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, - u32 uval, u32 newval) -{ - int ret; - - pagefault_disable(); - ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); - pagefault_enable(); - - return ret; -} - -static int get_futex_value_locked(u32 *dest, u32 __user *from) -{ - int ret; - - pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); - pagefault_enable(); - - return ret ? -EFAULT : 0; -} - - -/* - * PI code: - */ -static int refill_pi_state_cache(void) -{ - struct futex_pi_state *pi_state; - - if (likely(current->pi_state_cache)) - return 0; - - pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); - - if (!pi_state) - return -ENOMEM; - - INIT_LIST_HEAD(&pi_state->list); - /* pi_mutex gets initialized later */ - pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); - pi_state->key = FUTEX_KEY_INIT; - - current->pi_state_cache = pi_state; - - return 0; -} - -static struct futex_pi_state * alloc_pi_state(void) -{ - struct futex_pi_state *pi_state = current->pi_state_cache; - - WARN_ON(!pi_state); - current->pi_state_cache = NULL; - - return pi_state; -} - -static void free_pi_state(struct futex_pi_state *pi_state) -{ - if (!atomic_dec_and_test(&pi_state->refcount)) - return; - - /* - * If pi_state->owner is NULL, the owner is most probably dying - * and has cleaned up the pi_state already - */ - if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); - - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); - } - - if (current->pi_state_cache) - kfree(pi_state); - else { - /* - * pi_state->list is already empty. - * clear pi_state->owner. 
- * refcount is at 0 - put it back to 1. - */ - pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); - current->pi_state_cache = pi_state; - } -} - -/* - * Look up the task based on what TID userspace gave us. - * We dont trust it. - */ -static struct task_struct * futex_find_get_task(pid_t pid) -{ - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - get_task_struct(p); - - rcu_read_unlock(); - - return p; -} - -/* - * This task is holding PI mutexes at exit time => bad. - * Kernel cleans up PI-state, but userspace is likely hosed. - * (Robust-futex cleanup is separate and might save the day for userspace.) - */ -void exit_pi_state_list(struct task_struct *curr) -{ - struct list_head *next, *head = &curr->pi_state_list; - struct futex_pi_state *pi_state; - struct futex_hash_bucket *hb; - union futex_key key = FUTEX_KEY_INIT; - - if (!futex_cmpxchg_enabled) - return; - /* - * We are a ZOMBIE and nobody can enqueue itself on - * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselves: - */ - raw_spin_lock_irq(&curr->pi_lock); - while (!list_empty(head)) { - - next = head->next; - pi_state = list_entry(next, struct futex_pi_state, list); - key = pi_state->key; - hb = hash_futex(&key); - raw_spin_unlock_irq(&curr->pi_lock); - - spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - spin_unlock(&hb->lock); - continue; - } - - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); - - rt_mutex_unlock(&pi_state->pi_mutex); - - spin_unlock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); - } - raw_spin_unlock_irq(&curr->pi_lock); -} - -static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state 
**ps) -{ - struct futex_pi_state *pi_state = NULL; - struct futex_q *this, *next; - struct plist_head *head; - struct task_struct *p; - pid_t pid = uval & FUTEX_TID_MASK; - - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex(&this->key, key)) { - /* - * Another waiter already exists - bump up - * the refcount and return its pi_state: - */ - pi_state = this->pi_state; - /* - * Userspace might have messed up non-PI and PI futexes - */ - if (unlikely(!pi_state)) - return -EINVAL; - - WARN_ON(!atomic_read(&pi_state->refcount)); - - /* - * When pi_state->owner is NULL then the owner died - * and another waiter is on the fly. pi_state->owner - * is fixed up by the task which acquires - * pi_state->rt_mutex. - * - * We do not check for pid == 0 which can happen when - * the owner died and robust_list_exit() cleared the - * TID. - */ - if (pid && pi_state->owner) { - /* - * Bail out if user space manipulated the - * futex value. - */ - if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; - } - - atomic_inc(&pi_state->refcount); - *ps = pi_state; - - return 0; - } - } - - /* - * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 - */ - if (!pid) - return -ESRCH; - p = futex_find_get_task(pid); - if (!p) - return -ESRCH; - - /* - * We need to look at the task state flags to figure out, - * whether the task is exiting. To protect against the do_exit - * change of the task flags, we do this protected by - * p->pi_lock: - */ - raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->flags & PF_EXITING)) { - /* - * The task is on the way out. When PF_EXITPIDONE is - * set, we know that the task has finished the - * cleanup: - */ - int ret = (p->flags & PF_EXITPIDONE) ? 
-ESRCH : -EAGAIN; - - raw_spin_unlock_irq(&p->pi_lock); - put_task_struct(p); - return ret; - } - - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make 'p' - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - pi_state->owner = p; - raw_spin_unlock_irq(&p->pi_lock); - - put_task_struct(p); - - *ps = pi_state; - - return 0; -} - -/** - * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex - * @uaddr: the pi futex user address - * @hb: the pi futex hash bucket - * @key: the futex key associated with uaddr and hb - * @ps: the pi_state pointer where we store the result of the - * lookup - * @task: the task to perform the atomic lock work for. This will - * be "current" except in the case of requeue pi. - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) - * - * Returns: - * 0 - ready to wait - * 1 - acquired the lock - * <0 - error - * - * The hb->lock and futex_key refs shall be held by the caller. - */ -static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, - union futex_key *key, - struct futex_pi_state **ps, - struct task_struct *task, int set_waiters) -{ - int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval, vpid = task_pid_vnr(task); - -retry: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = vpid; - if (set_waiters) - newval |= FUTEX_WAITERS; - - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) - return -EFAULT; - - /* - * Detect deadlocks. - */ - if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) - return -EDEADLK; - - /* - * Surprise - we got the lock. 
Just return to userspace: - */ - if (unlikely(!curval)) - return 1; - - uval = curval; - - /* - * Set the FUTEX_WAITERS flag, so the owner will know it has someone - * to wake at the next unlock. - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! - */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | vpid; - ownerdied = 0; - lock_taken = 1; - } - - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; - if (unlikely(curval != uval)) - goto retry; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - return 1; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, key, ps); - - if (unlikely(ret)) { - switch (ret) { - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - return -EFAULT; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry; - } - default: - break; - } - } - - return ret; -} - -/** - * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket - * @q: The futex_q to unqueue - * - * The q->lock_ptr must not be NULL and must be held by the caller. 
- */ -static void __unqueue_futex(struct futex_q *q) -{ - struct futex_hash_bucket *hb; - - if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) - || WARN_ON(plist_node_empty(&q->list))) - return; - - hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); - plist_del(&q->list, &hb->chain); -} - -/* - * The hash bucket lock must be held when this is called. - * Afterwards, the futex_q must not be accessed. - */ -static void wake_futex(struct futex_q *q) -{ - struct task_struct *p = q->task; - - /* - * We set q->lock_ptr = NULL _before_ we wake up the task. If - * a non-futex wake up happens on another CPU then the task - * might exit and p would dereference a non-existing task - * struct. Prevent this by holding a reference on p across the - * wake up. - */ - get_task_struct(p); - - __unqueue_futex(q); - /* - * The waiting task can free the futex_q as soon as - * q->lock_ptr = NULL is written, without taking any locks. A - * memory barrier is required here to prevent the following - * store to lock_ptr from getting ahead of the plist_del. - */ - smp_wmb(); - q->lock_ptr = NULL; - - wake_up_state(p, TASK_NORMAL); - put_task_struct(p); -} - -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) -{ - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; - u32 uninitialized_var(curval), newval; - - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - - raw_spin_lock(&pi_state->pi_mutex.wait_lock); - new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); - - /* - * It is possible that the next waiter (the one that brought - * this owner to the kernel) timed out and is no longer - * waiting on the lock. - */ - if (!new_owner) - new_owner = this->task; - - /* - * We pass it to the next owner. 
(The WAITERS bit is always - * kept enabled while there is PI state around. We must also - * preserve the owner died bit.) - */ - if (!(uval & FUTEX_OWNER_DIED)) { - int ret = 0; - - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; - if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - return ret; - } - } - - raw_spin_lock_irq(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); - - raw_spin_lock_irq(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); - - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - rt_mutex_unlock(&pi_state->pi_mutex); - - return 0; -} - -static int unlock_futex_pi(u32 __user *uaddr, u32 uval) -{ - u32 uninitialized_var(oldval); - - /* - * There is no waiter, so we unlock the futex. The owner died - * bit has not to be preserved here. We are the owner: - */ - if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) - return -EFAULT; - if (oldval != uval) - return -EAGAIN; - - return 0; -} - -/* - * Express the locking dependencies for lockdep: - */ -static inline void -double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - if (hb1 <= hb2) { - spin_lock(&hb1->lock); - if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); - } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); - } -} - -static inline void -double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); -} - -/* - * Wake up waiters matching bitset queued on this futex (uaddr). 
- */ -static int -futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) -{ - struct futex_hash_bucket *hb; - struct futex_q *this, *next; - struct plist_head *head; - union futex_key key = FUTEX_KEY_INIT; - int ret; - - if (!bitset) - return -EINVAL; - - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); - if (unlikely(ret != 0)) - goto out; - - hb = hash_futex(&key); - spin_lock(&hb->lock); - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &key)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - break; - } - - /* Check if one of the bits is set in both bitsets */ - if (!(this->bitset & bitset)) - continue; - - wake_futex(this); - if (++ret >= nr_wake) - break; - } - } - - spin_unlock(&hb->lock); - put_futex_key(&key); -out: - return ret; -} - -/* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: - */ -static int -futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - int nr_wake, int nr_wake2, int op) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head; - struct futex_q *this, *next; - int ret, op_ret; - -retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); - if (unlikely(ret != 0)) - goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out_put_key1; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry_private: - double_lock_hb(hb1, hb2); - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - - double_unlock_hb(hb1, hb2); - -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { - ret = op_ret; - goto out_put_keys; - } - 
- ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&key2); - put_futex_key(&key1); - goto retry; - } - - head = &hb1->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &key1)) { - wake_futex(this); - if (++ret >= nr_wake) - break; - } - } - - if (op_ret > 0) { - head = &hb2->chain; - - op_ret = 0; - plist_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &key2)) { - wake_futex(this); - if (++op_ret >= nr_wake2) - break; - } - } - ret += op_ret; - } - - double_unlock_hb(hb1, hb2); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: - return ret; -} - -/** - * requeue_futex() - Requeue a futex_q from one hb to another - * @q: the futex_q to requeue - * @hb1: the source hash_bucket - * @hb2: the target hash_bucket - * @key2: the new key for the requeued futex_q - */ -static inline -void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, union futex_key *key2) -{ - - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(&hb1->chain != &hb2->chain)) { - plist_del(&q->list, &hb1->chain); - plist_add(&q->list, &hb2->chain); - q->lock_ptr = &hb2->lock; - } - get_futex_key_refs(key2); - q->key = *key2; -} - -/** - * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * @q: the futex_q - * @key: the key of the requeue target futex - * @hb: the hash_bucket of the requeue target futex - * - * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. Set the futex_q key - * to the requeue target futex so the waiter can detect the wakeup on the right - * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. 
Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with both q->lock_ptr and hb->lock held. - */ -static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, - struct futex_hash_bucket *hb) -{ - get_futex_key_refs(key); - q->key = *key; - - __unqueue_futex(q); - - WARN_ON(!q->rt_waiter); - q->rt_waiter = NULL; - - q->lock_ptr = &hb->lock; - - wake_up_state(q->task, TASK_NORMAL); -} - -/** - * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key - * @ps: address to store the pi_state pointer - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) - * - * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. If the caller specified set_waiters, - * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. - * hb1 and hb2 must be held by the caller. - * - * Returns: - * 0 - failed to acquire the lock atomicly - * 1 - acquired the lock - * <0 - error - */ -static int futex_proxy_trylock_atomic(u32 __user *pifutex, - struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, - union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps, int set_waiters) -{ - struct futex_q *top_waiter = NULL; - u32 curval; - int ret; - - if (get_futex_value_locked(&curval, pifutex)) - return -EFAULT; - - /* - * Find the top_waiter and determine if there are additional waiters. - * If the caller intends to requeue more than 1 waiter to pifutex, - * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, - * as we have means to handle the possible fault. 
If not, don't set - * the bit unecessarily as it will force the subsequent unlock to enter - * the kernel. - */ - top_waiter = futex_top_waiter(hb1, key1); - - /* There are no waiters, nothing for us to do. */ - if (!top_waiter) - return 0; - - /* Ensure we requeue to the expected futex. */ - if (!match_futex(top_waiter->requeue_pi_key, key2)) - return -EINVAL; - - /* - * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in - * the contended case or if set_waiters is 1. The pi_state is returned - * in ps in contended cases. - */ - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - set_waiters); - if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2, hb2); - - return ret; -} - -/** - * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) - * @uaddr2: target futex user address - * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * @nr_requeue: number of waiters to requeue (0-INT_MAX) - * @cmpval: @uaddr1 expected value (or %NULL) - * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) - * - * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire - * uaddr2 atomically on behalf of the top waiter. - * - * Returns: - * >=0 - on success, the number of tasks requeued or woken - * <0 - on error - */ -static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - u32 __user *uaddr2, int nr_wake, int nr_requeue, - u32 *cmpval, int requeue_pi) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int drop_count = 0, task_count = 0, ret; - struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head1; - struct futex_q *this, *next; - u32 curval2; - - if (requeue_pi) { - /* - * requeue_pi requires a pi_state, try to allocate it now - * without any locks in case it fails. 
- */ - if (refill_pi_state_cache()) - return -ENOMEM; - /* - * requeue_pi must wake as many tasks as it can, up to nr_wake - * + nr_requeue, since it acquires the rt_mutex prior to - * returning to userspace, so as to not leave the rt_mutex with - * waiters and no owner. However, second and third wake-ups - * cannot be predicted as they involve race conditions with the - * first wake and a fault while looking up the pi_state. Both - * pthread_cond_signal() and pthread_cond_broadcast() should - * use nr_wake=1. - */ - if (nr_wake != 1) - return -EINVAL; - } - -retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct. - */ - free_pi_state(pi_state); - pi_state = NULL; - } - - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); - if (unlikely(ret != 0)) - goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); - if (unlikely(ret != 0)) - goto out_put_key1; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry_private: - double_lock_hb(hb1, hb2); - - if (likely(cmpval != NULL)) { - u32 curval; - - ret = get_futex_value_locked(&curval, uaddr1); - - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - - ret = get_user(curval, uaddr1); - if (ret) - goto out_put_keys; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&key2); - put_futex_key(&key1); - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } - - if (requeue_pi && (task_count - nr_wake < nr_requeue)) { - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. We force this here where we are able to easily handle - * faults rather in the requeue loop below. 
- */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, nr_requeue); - - /* - * At this point the top_waiter has either taken uaddr2 or is - * waiting on it. If the former, then the pi_state will not - * exist yet, look it up one more time to ensure we have a - * reference to it. - */ - if (ret == 1) { - WARN_ON(pi_state); - drop_count++; - task_count++; - ret = get_futex_value_locked(&curval2, uaddr2); - if (!ret) - ret = lookup_pi_state(curval2, hb2, &key2, - &pi_state); - } - - switch (ret) { - case 0: - break; - case -EFAULT: - double_unlock_hb(hb1, hb2); - put_futex_key(&key2); - put_futex_key(&key1); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - goto out; - case -EAGAIN: - /* The owner was exiting, try again. */ - double_unlock_hb(hb1, hb2); - put_futex_key(&key2); - put_futex_key(&key1); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - head1 = &hb1->chain; - plist_for_each_entry_safe(this, next, head1, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!match_futex(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter)) { - ret = -EINVAL; - break; - } - - /* - * Wake nr_wake waiters. For requeue_pi, if we acquired the - * lock, we already woke the top_waiter. If not, it will be - * woken by futex_unlock_pi(). - */ - if (++task_count <= nr_wake && !requeue_pi) { - wake_futex(this); - continue; - } - - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } - - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - */ - if (requeue_pi) { - /* Prepare the waiter to take the rt_mutex. 
*/ - atomic_inc(&pi_state->refcount); - this->pi_state = pi_state; - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task, 1); - if (ret == 1) { - /* We got the lock. */ - requeue_pi_wake_futex(this, &key2, hb2); - drop_count++; - continue; - } else if (ret) { - /* -EDEADLK */ - this->pi_state = NULL; - free_pi_state(pi_state); - goto out_unlock; - } - } - requeue_futex(this, hb1, hb2, &key2); - drop_count++; - } - -out_unlock: - double_unlock_hb(hb1, hb2); - - /* - * drop_futex_key_refs() must be called outside the spinlocks. During - * the requeue we moved futex_q's from the hash bucket at key1 to the - * one at key2 and updated their key pointer. We no longer need to - * hold the references to key1. - */ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: - if (pi_state != NULL) - free_pi_state(pi_state); - return ret ? ret : task_count; -} - -/* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) - __acquires(&hb->lock) -{ - struct futex_hash_bucket *hb; - - hb = hash_futex(&q->key); - q->lock_ptr = &hb->lock; - - spin_lock(&hb->lock); - return hb; -} - -static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) -{ - spin_unlock(&hb->lock); -} - -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). 
- */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) -{ - int prio; - - /* - * The priority used to register this element is - * - either the real thread-priority for the real-time threads - * (i.e. threads with a priority lower than MAX_RT_PRIO) - * - or MAX_RT_PRIO for non-RT threads. - * Thus, all RT-threads are woken first in priority order, and - * the others are woken last, in FIFO order. - */ - prio = min(current->normal_prio, MAX_RT_PRIO); - - plist_node_init(&q->list, prio); - plist_add(&q->list, &hb->chain); - q->task = current; - spin_unlock(&hb->lock); -} - -/** - * unqueue_me() - Remove the futex_q from its futex_hash_bucket - * @q: The futex_q to unqueue - * - * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must - * be paired with exactly one earlier call to queue_me(). - * - * Returns: - * 1 - if the futex_q was still queued (and we removed unqueued it) - * 0 - if the futex_q was already removed by the waking thread - */ -static int unqueue_me(struct futex_q *q) -{ - spinlock_t *lock_ptr; - int ret = 0; - - /* In the common case we don't take the spinlock, which is nice. */ -retry: - lock_ptr = q->lock_ptr; - barrier(); - if (lock_ptr != NULL) { - spin_lock(lock_ptr); - /* - * q->lock_ptr can change between reading it and - * spin_lock(), causing us to take the wrong lock. This - * corrects the race condition. - * - * Reasoning goes like this: if we have the wrong lock, - * q->lock_ptr must have changed (maybe several times) - * between reading it and the spin_lock(). It can - * change again after the spin_lock() but only if it was - * already changed before the spin_lock(). It cannot, - * however, change back to the original value. Therefore - * we can detect whether we acquired the correct lock. 
- */ - if (unlikely(lock_ptr != q->lock_ptr)) { - spin_unlock(lock_ptr); - goto retry; - } - __unqueue_futex(q); - - BUG_ON(q->pi_state); - - spin_unlock(lock_ptr); - ret = 1; - } - - drop_futex_key_refs(&q->key); - return ret; -} - -/* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry - * and dropped here. - */ -static void unqueue_me_pi(struct futex_q *q) - __releases(q->lock_ptr) -{ - __unqueue_futex(q); - - BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); - q->pi_state = NULL; - - spin_unlock(q->lock_ptr); -} - -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) -{ - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; - struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner = pi_state->owner; - u32 uval, uninitialized_var(curval), newval; - int ret; - - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; - - /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but failed to get the rtmutex the first time. - * We have to replace the newowner TID in the user space variable. - * This must be atomic as we have to preserve the owner died bit here. - * - * Note: We write the user space value _before_ changing the pi_state - * because we can fault here. Imagine swapped out pages or a fork - * that marked all the anonymous memory readonly for cow. - * - * Modifying pi_state _before_ the user space value would - * leave the pi_state in an inconsistent state when we fault - * here, because we need to drop the hash bucket lock to - * handle the fault. This might be observed in the PID check - * in lookup_pi_state. 
- */ -retry: - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; - - while (1) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; - if (curval == uval) - break; - uval = curval; - } - - /* - * We fixed up user space. Now we need to fix the pi_state - * itself. - */ - if (pi_state->owner != NULL) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); - } - - pi_state->owner = newowner; - - raw_spin_lock_irq(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock_irq(&newowner->pi_lock); - return 0; - - /* - * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the highest priority - * waiter itself or the task which stole the rtmutex) the - * chance to try the fixup of the pi_state. So once we are - * back from handling the fault we need to check the pi_state - * after reacquiring the hash bucket lock and before trying to - * do another fixup. When the fixup has been done already we - * simply return. 
- */ -handle_fault: - spin_unlock(q->lock_ptr); - - ret = fault_in_user_writeable(uaddr); - - spin_lock(q->lock_ptr); - - /* - * Check if someone else fixed it for us: - */ - if (pi_state->owner != oldowner) - return 0; - - if (ret) - return ret; - - goto retry; -} - -static long futex_wait_restart(struct restart_block *restart); - -/** - * fixup_owner() - Post lock pi_state and corner case management - * @uaddr: user address of the futex - * @q: futex_q (contains pi_state and access to the rt_mutex) - * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) - * - * After attempting to lock an rt_mutex, this function is called to cleanup - * the pi_state owner as well as handle race conditions that may allow us to - * acquire the lock. Must be called with the hb lock held. - * - * Returns: - * 1 - success, lock taken - * 0 - success, lock not taken - * <0 - on error (-EFAULT) - */ -static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) -{ - struct task_struct *owner; - int ret = 0; - - if (locked) { - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case: - */ - if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); - goto out; - } - - /* - * Catch the rare case, where the lock was released when we were on the - * way back before we locked the hash bucket. - */ - if (q->pi_state->owner == current) { - /* - * Try to get the rt_mutex now. This might fail as some other - * task acquired the rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - - /* - * pi_state is incorrect, some other task did a lock steal and - * we returned due to timeout or signal without taking the - * rt_mutex. Too late. 
- */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); - goto out; - } - - /* - * Paranoia check. If we did not take the lock, then we should not be - * the owner of the rt_mutex. - */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) - printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " - "pi-state %p\n", ret, - q->pi_state->pi_mutex.owner, - q->pi_state->owner); - -out: - return ret ? ret : locked; -} - -/** - * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal - * @hb: the futex hash bucket, must be locked by the caller - * @q: the futex_q to queue up on - * @timeout: the prepared hrtimer_sleeper, or null for no timeout - */ -static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) -{ - /* - * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using set_mb() and - * queue_me() calls spin_unlock() upon completion, both serializing - * access to the hash list and forcing another memory barrier. - */ - set_current_state(TASK_INTERRUPTIBLE); - queue_me(q, hb); - - /* Arm the timer */ - if (timeout) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - - /* - * If we have been removed from the hash list, then another task - * has tried to wake us, and we can skip the call to schedule(). - */ - if (likely(!plist_node_empty(&q->list))) { - /* - * If the timer has already expired, current will already be - * flagged for rescheduling. Only call schedule if there - * is no timeout, or if it has yet to expire. 
- */ - if (!timeout || timeout->task) - schedule(); - } - __set_current_state(TASK_RUNNING); -} - -/** - * futex_wait_setup() - Prepare to wait on a futex - * @uaddr: the futex userspace address - * @val: the expected value - * @flags: futex flags (FLAGS_SHARED, etc.) - * @q: the associated futex_q - * @hb: storage for hash_bucket pointer to be returned to caller - * - * Setup the futex_q and locate the hash_bucket. Get the futex value and - * compare it with the expected value. Handle atomic faults internally. - * Return with the hb lock held and a q.key reference on success, and unlocked - * with no q.key reference on failure. - * - * Returns: - * 0 - uaddr contains val and hb has been locked - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked - */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) -{ - u32 uval; - int ret; - - /* - * Access the page AFTER the hash-bucket is locked. - * Order is important: - * - * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); - * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } - * - * The basic logical guarantee of a futex is that it blocks ONLY - * if cond(var) is known to be true at the time of blocking, for - * any cond. If we locked the hash-bucket after testing *uaddr, that - * would open a race condition where we could block indefinitely with - * cond(var) false, which would violate the guarantee. - * - * On the other hand, we insert q and release the hash-bucket only - * after testing *uaddr. This guarantees that futex_wait() will NOT - * absorb a wakeup if *uaddr does not match the desired values - * while the syscall executes. 
- */ -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); - if (unlikely(ret != 0)) - return ret; - -retry_private: - *hb = queue_lock(q); - - ret = get_futex_value_locked(&uval, uaddr); - - if (ret) { - queue_unlock(q, *hb); - - ret = get_user(uval, uaddr); - if (ret) - goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&q->key); - goto retry; - } - - if (uval != val) { - queue_unlock(q, *hb); - ret = -EWOULDBLOCK; - } - -out: - if (ret) - put_futex_key(&q->key); - return ret; -} - -static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct restart_block *restart; - struct futex_hash_bucket *hb; - struct futex_q q = futex_q_init; - int ret; - - if (!bitset) - return -EINVAL; - q.bitset = bitset; - - if (abs_time) { - to = &timeout; - - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - -retry: - /* - * Prepare to wait on uaddr. On success, holds hb lock and increments - * q.key refs. - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) - goto out; - - /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to); - - /* If we were woken (and unqueued), we succeeded, whatever. */ - ret = 0; - /* unqueue_me() drops q.key ref */ - if (!unqueue_me(&q)) - goto out; - ret = -ETIMEDOUT; - if (to && !to->task) - goto out; - - /* - * We expect signal_pending(current), but we might be the - * victim of a spurious wakeup as well. 
- */ - if (!signal_pending(current)) - goto retry; - - ret = -ERESTARTSYS; - if (!abs_time) - goto out; - - restart = &current_thread_info()->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = abs_time->tv64; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; - - ret = -ERESTART_RESTARTBLOCK; - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret; -} - - -static long futex_wait_restart(struct restart_block *restart) -{ - u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; - - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t.tv64 = restart->futex.time; - tp = &t; - } - restart->fn = do_no_restart_syscall; - - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); -} - - -/* - * Userspace tried a 0 -> TID atomic transition of the futex value - * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) - */ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, - ktime_t *time, int trylock) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct futex_hash_bucket *hb; - struct futex_q q = futex_q_init; - int res, ret; - - if (refill_pi_state_cache()) - return -ENOMEM; - - if (time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires(&to->timer, *time); - } - -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; - -retry_private: - hb = queue_lock(&q); - - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); - if (unlikely(ret)) { - switch (ret) { - case 1: - /* We got the lock.
*/ - ret = 0; - goto out_unlock_put_key; - case -EFAULT: - goto uaddr_faulted; - case -EAGAIN: - /* - * Task is exiting and we just wait for the - * exit to complete. - */ - queue_unlock(&q, hb); - put_futex_key(&q.key); - cond_resched(); - goto retry; - default: - goto out_unlock_put_key; - } - } - - /* - * Only actually queue now that the atomic ops are done: - */ - queue_me(&q, hb); - - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) - ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); - else { - ret = rt_mutex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 0 : -EWOULDBLOCK; - } - - spin_lock(q.lock_ptr); - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_owner(uaddr, &q, !ret); - /* - * If fixup_owner() returned an error, proprogate that. If it acquired - * the lock, clear our -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - /* - * If fixup_owner() faulted and was unable to handle the fault, unlock - * it and return the fault to userspace. - */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) - rt_mutex_unlock(&q.pi_state->pi_mutex); - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q); - - goto out_put_key; - -out_unlock_put_key: - queue_unlock(&q, hb); - -out_put_key: - put_futex_key(&q.key); -out: - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; - -uaddr_faulted: - queue_unlock(&q, hb); - - ret = fault_in_user_writeable(uaddr); - if (ret) - goto out_put_key; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&q.key); - goto retry; -} - -/* - * Userspace attempted a TID -> 0 atomic transition, and failed. - * This is the in-kernel slowpath: we look up the PI state (if any), - * and do the rt-mutex unlock. 
- */ -static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) -{ - struct futex_hash_bucket *hb; - struct futex_q *this, *next; - struct plist_head *head; - union futex_key key = FUTEX_KEY_INIT; - u32 uval, vpid = task_pid_vnr(current); - int ret; - -retry: - if (get_user(uval, uaddr)) - return -EFAULT; - /* - * We release only a lock we actually own: - */ - if ((uval & FUTEX_TID_MASK) != vpid) - return -EPERM; - - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; - - hb = hash_futex(&key); - spin_lock(&hb->lock); - - /* - * To avoid races, try to do the TID -> 0 atomic transition - * again. If it succeeds then we can return without waking - * anyone else up: - */ - if (!(uval & FUTEX_OWNER_DIED) && - cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) - goto pi_faulted; - /* - * Rare case: we managed to release the lock atomically, - * no need to wake anyone else up: - */ - if (unlikely(uval == vpid)) - goto out_unlock; - - /* - * Ok, other tasks may need to be woken up - check waiters - * and do the wakeup if necessary: - */ - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { - if (!match_futex (&this->key, &key)) - continue; - ret = wake_futex_pi(uaddr, uval, this); - /* - * The atomic access to the futex value - * generated a pagefault, so retry the - * user-access and the wakeup: - */ - if (ret == -EFAULT) - goto pi_faulted; - goto out_unlock; - } - /* - * No waiters - kernel unlocks the futex: - */ - if (!(uval & FUTEX_OWNER_DIED)) { - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; - } - -out_unlock: - spin_unlock(&hb->lock); - put_futex_key(&key); - -out: - return ret; - -pi_faulted: - spin_unlock(&hb->lock); - put_futex_key(&key); - - ret = fault_in_user_writeable(uaddr); - if (!ret) - goto retry; - - return ret; -} - -/** - * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex - * @hb: the hash_bucket futex_q was 
original enqueued on - * @q: the futex_q woken while waiting to be requeued - * @key2: the futex_key of the requeue target futex - * @timeout: the timeout associated with the wait (NULL if none) - * - * Detect if the task was woken on the initial futex as opposed to the requeue - * target futex. If so, determine if it was a timeout or a signal that caused - * the wakeup and return the appropriate error code to the caller. Must be - * called with the hb lock held. - * - * Returns - * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR - */ -static inline -int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, union futex_key *key2, - struct hrtimer_sleeper *timeout) -{ - int ret = 0; - - /* - * With the hb lock held, we avoid races while we process the wakeup. - * We only need to hold hb (and not hb2) to ensure atomicity as the - * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. - * It can't be requeued from uaddr2 to something else since we don't - * support a PI aware source futex for requeue. - */ - if (!match_futex(&q->key, key2)) { - WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - - /* Handle spurious wakeups gracefully */ - ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; - } - return ret; -} - -/** - * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initially wait on (non-pi) - * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. 
- * @val: the expected value of uaddr - * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) - * @uaddr2: the pi futex we will take prior to returning to user-space - * - * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. - * - * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: - * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue - * 3) signal - * 4) timeout - * - * If 3, cleanup and return -ERESTARTNOINTR. - * - * If 2, we may then block on trying to take the rt_mutex and return via: - * 5) successful lock - * 6) signal - * 7) timeout - * 8) other lock acquisition failure - * - * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). - * - * If 4 or 7, we cleanup and return with -ETIMEDOUT. - * - * Returns: - * 0 - On success - * <0 - On error - */ -static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - u32 val, ktime_t *abs_time, u32 bitset, - u32 __user *uaddr2) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct rt_mutex_waiter rt_waiter; - struct rt_mutex *pi_mutex = NULL; - struct futex_hash_bucket *hb; - union futex_key key2 = FUTEX_KEY_INIT; - struct futex_q q = futex_q_init; - int res, ret; - - if (!bitset) - return -EINVAL; - - if (abs_time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 
- CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - - /* - * The waiter is allocated on our stack, manipulated by the requeue - * code while we sleep on uaddr. - */ - debug_rt_mutex_init_waiter(&rt_waiter); - rt_waiter.task = NULL; - - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; - - q.bitset = bitset; - q.rt_waiter = &rt_waiter; - q.requeue_pi_key = &key2; - - /* - * Prepare to wait on uaddr. On success, increments q.key (key1) ref - * count. - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) - goto out_key2; - - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to); - - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; - - /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. - */ - - /* Check if the requeue code acquired the second futex for us. */ - if (!q.rt_waiter) { - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case. - */ - if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current); - spin_unlock(q.lock_ptr); - } - } else { - /* - * We have been woken up by futex_unlock_pi(), a timeout, or a - * signal. futex_unlock_pi() will not destroy the lock_ptr nor - * the pi_state. 
- */ - WARN_ON(!&q.pi_state); - pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); - debug_rt_mutex_free_waiter(&rt_waiter); - - spin_lock(q.lock_ptr); - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_owner(uaddr2, &q, !ret); - /* - * If fixup_owner() returned an error, proprogate that. If it - * acquired the lock, clear -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - /* Unqueue and drop the lock. */ - unqueue_me_pi(&q); - } - - /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock the rt_mutex and return the fault to userspace. - */ - if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); - } else if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart by calling - * futex_lock_pi() directly. We could restart this syscall, but - * it would detect that the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart and return - * -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; - } - -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret; -} - -/* - * Support for robust futexes: the kernel cleans up held futexes at - * thread exit time. - * - * Implementation: user-space maintains a per-thread list of locks it - * is holding. Upon do_exit(), the kernel carefully walks this list, - * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is - * always manipulated with the lock held, so the list is private and - * per-thread. 
Userspace also maintains a per-thread 'list_op_pending' - * field, to allow the kernel to clean up if the thread dies after - * acquiring the lock, but just before it could have added itself to - * the list. There can only be one such pending lock. - */ - -/** - * sys_set_robust_list() - Set the robust-futex list head of a task - * @head: pointer to the list-head - * @len: length of the list-head, as userspace expects - */ -SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, - size_t, len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - /* - * The kernel knows only one size for now: - */ - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->robust_list = head; - - return 0; -} - -/** - * sys_get_robust_list() - Get the robust-futex list head of a task - * @pid: pid of the process [zero for current task] - * @head_ptr: pointer to a list-head pointer, the kernel fills it in - * @len_ptr: pointer to a length field, the kernel fills in the header size - */ -SYSCALL_DEFINE3(get_robust_list, int, pid, - struct robust_list_head __user * __user *, head_ptr, - size_t __user *, len_ptr) -{ - struct robust_list_head __user *head; - unsigned long ret; - const struct cred *cred = current_cred(), *pcred; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (!pid) - head = current->robust_list; - else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - ret = -EPERM; - pcred = __task_cred(p); - /* If victim is in different user_ns, then uids are not - comparable, so we must have CAP_SYS_PTRACE */ - if (cred->user->user_ns != pcred->user->user_ns) { - if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; - goto ok; - } - /* If victim is in same user_ns, then uids are comparable */ - if (cred->euid != pcred->euid && - cred->euid != pcred->uid && - !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; -ok: - head = p->robust_list; - 
rcu_read_unlock(); - } - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(head, head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; -} - -/* - * Process a futex-list entry, check whether it's owned by the - * dying task, and do notification if so: - */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) -{ - u32 uval, uninitialized_var(nval), mval; - -retry: - if (get_user(uval, uaddr)) - return -1; - - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { - if (fault_in_user_writeable(uaddr)) - return -1; - goto retry; - } - if (nval != uval) - goto retry; - - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); - } - return 0; -} - -/* - * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: - */ -static inline int fetch_robust_entry(struct robust_list __user **entry, - struct robust_list __user * __user *head, - unsigned int *pi) -{ - unsigned long uentry; - - if (get_user(uentry, (unsigned long __user *)head)) - return -EFAULT; - - *entry = (void __user *)(uentry & ~1UL); - *pi = uentry & 1; - - return 0; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. - */ -void exit_robust_list(struct task_struct *curr) -{ - struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); - unsigned long futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * don't process it twice: - */ - if (entry != pending) - if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi)) - return; - if (rc) - return; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - - if (pending) - handle_futex_death((void __user 
*)pending + futex_offset, - curr, pip); -} - -long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - u32 __user *uaddr2, u32 val2, u32 val3) -{ - int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; - unsigned int flags = 0; - - if (!(op & FUTEX_PRIVATE_FLAG)) - flags |= FLAGS_SHARED; - - if (op & FUTEX_CLOCK_REALTIME) { - flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) - return -ENOSYS; - } - - switch (cmd) { - case FUTEX_WAIT: - val3 = FUTEX_BITSET_MATCH_ANY; - case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, flags, val, timeout, val3); - break; - case FUTEX_WAKE: - val3 = FUTEX_BITSET_MATCH_ANY; - case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, flags, val, val3); - break; - case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); - break; - case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); - break; - case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); - break; - case FUTEX_LOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, flags, val, timeout, 0); - break; - case FUTEX_UNLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, flags); - break; - case FUTEX_TRYLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); - break; - case FUTEX_WAIT_REQUEUE_PI: - val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, - uaddr2); - break; - case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); - break; - default: - ret = -ENOSYS; - } - return ret; -} - - -SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct timespec __user *, utime, u32 __user *, uaddr2, - u32, val3) -{ - struct timespec ts; - ktime_t t, *tp = NULL; - u32 val2 = 0; - int cmd = op & FUTEX_CMD_MASK; - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == 
FUTEX_WAIT_REQUEUE_PI)) { - if (copy_from_user(&ts, utime, sizeof(ts)) != 0) - return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - - t = timespec_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } - /* - * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. - * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. - */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -} - -static int __init futex_init(void) -{ - u32 curval; - int i; - - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; - - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); - } - - return 0; -} -__initcall(futex_init); -/* - * linux/kernel/futex_compat.c - * - * Futex compatibililty routines. - * - * Copyright 2006, Red Hat, Inc., Ingo Molnar - */ - -#include -#include -#include -#include - -#include - - -/* - * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: - */ -static inline int -fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, unsigned int *pi) -{ - if (get_user(*uentry, head)) - return -EFAULT; - - *entry = compat_ptr((*uentry) & ~1); - *pi = (unsigned int)(*uentry) & 1; - - return 0; -} - -static void __user *futex_uaddr(struct robust_list __user *entry, - compat_long_t futex_offset) -{ - compat_uptr_t base = ptr_to_compat(entry); - void __user *uaddr = compat_ptr(base + futex_offset); - - return uaddr; -} - -/* - * Walk curr->robust_list (very carefully, it's a userspace list!) - * and mark any locks found there dead, and notify any waiters. - * - * We silently return on any sign of list-walking problem. - */ -void compat_exit_robust_list(struct task_struct *curr) -{ - struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); - compat_uptr_t uentry, next_uentry, upending; - compat_long_t futex_offset; - int rc; - - if (!futex_cmpxchg_enabled) - return; - - /* - * Fetch the list head (which was registered earlier, via - * sys_set_robust_list()): - */ - if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) - return; - /* - * Fetch the relative futex offset: - */ - if (get_user(futex_offset, &head->futex_offset)) - return; - /* - * Fetch any possibly pending lock-add first, and handle it - * if it exists: - */ - if (fetch_robust_entry(&upending, &pending, - &head->list_op_pending, &pip)) - return; - - next_entry = NULL; /* avoid warning with gcc */ - while (entry != (struct robust_list __user *) &head->list) { - /* - * Fetch the next entry in the list before calling - * handle_futex_death: - */ - rc = fetch_robust_entry(&next_uentry, &next_entry, - (compat_uptr_t __user *)&entry->next, &next_pi); - /* - * A pending lock might already be on the list, so - * 
dont process it twice: - */ - if (entry != pending) { - void __user *uaddr = futex_uaddr(entry, futex_offset); - - if (handle_futex_death(uaddr, curr, pi)) - return; - } - if (rc) - return; - uentry = next_uentry; - entry = next_entry; - pi = next_pi; - /* - * Avoid excessively long or circular lists: - */ - if (!--limit) - break; - - cond_resched(); - } - if (pending) { - void __user *uaddr = futex_uaddr(pending, futex_offset); - - handle_futex_death(uaddr, curr, pip); - } -} - -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) -{ - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (unlikely(len != sizeof(*head))) - return -EINVAL; - - current->compat_robust_list = head; - - return 0; -} - -asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, - compat_size_t __user *len_ptr) -{ - struct compat_robust_list_head __user *head; - unsigned long ret; - const struct cred *cred = current_cred(), *pcred; - - if (!futex_cmpxchg_enabled) - return -ENOSYS; - - if (!pid) - head = current->compat_robust_list; - else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p) - goto err_unlock; - ret = -EPERM; - pcred = __task_cred(p); - /* If victim is in different user_ns, then uids are not - comparable, so we must have CAP_SYS_PTRACE */ - if (cred->user->user_ns != pcred->user->user_ns) { - if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; - goto ok; - } - /* If victim is in same user_ns, then uids are comparable */ - if (cred->euid != pcred->euid && - cred->euid != pcred->uid && - !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; -ok: - head = p->compat_robust_list; - rcu_read_unlock(); - } - - if (put_user(sizeof(*head), len_ptr)) - return -EFAULT; - return put_user(ptr_to_compat(head), head_ptr); - -err_unlock: - rcu_read_unlock(); - - return ret; -} - -asmlinkage long compat_sys_futex(u32 
__user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - u32 val3) -{ - struct timespec ts; - ktime_t t, *tp = NULL; - int val2 = 0; - int cmd = op & FUTEX_CMD_MASK; - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (get_compat_timespec(&ts, utime)) - return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - - t = timespec_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -} -/* - * This code maintains a list of active profiling data structures. - * - * Copyright IBM Corp. 2009 - * Author(s): Peter Oberparleiter - * - * Uses gcc-internal data definitions. - * Based on the gcov-kernel patch by: - * Hubertus Franke - * Nigel Hinds - * Rajan Ravindran - * Peter Oberparleiter - * Paul Larson - */ - -#define pr_fmt(fmt) "gcov: " fmt - -#include -#include -#include -#include "gcov.h" - -static struct gcov_info *gcov_info_head; -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = info->version; - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. 
- */ - info->next = gcov_info_head; - gcov_info_head = info; - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -/** - * gcov_enable_events - enable event reporting through gcov_event() - * - * Turn on reporting of profiling data load/unload-events through the - * gcov_event() callback. Also replay all previous events once. This function - * is needed because some events are potentially generated too early for the - * callback implementation to handle them initially. - */ -void gcov_enable_events(void) -{ - struct gcov_info *info; - - mutex_lock(&gcov_lock); - gcov_events_enabled = 1; - /* Perform event callback for previously registered entries. */ - for (info = gcov_info_head; info; info = info->next) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} - -#ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - -/* Update list and generate events when modules are unloaded. */ -static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, - void *data) -{ - struct module *mod = data; - struct gcov_info *info; - struct gcov_info *prev; - - if (event != MODULE_STATE_GOING) - return NOTIFY_OK; - mutex_lock(&gcov_lock); - prev = NULL; - /* Remove entries located in module from linked list. 
*/ - for (info = gcov_info_head; info; info = info->next) { - if (within(info, mod->module_core, mod->core_size)) { - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; - if (gcov_events_enabled) - gcov_event(GCOV_REMOVE, info); - } else - prev = info; - } - mutex_unlock(&gcov_lock); - - return NOTIFY_OK; -} - -static struct notifier_block gcov_nb = { - .notifier_call = gcov_module_notifier, -}; - -static int __init gcov_init(void) -{ - return register_module_notifier(&gcov_nb); -} -device_initcall(gcov_init); -#endif /* CONFIG_MODULES */ -/* - * This code exports profiling data as debugfs files to userspace. - * - * Copyright IBM Corp. 2009 - * Author(s): Peter Oberparleiter - * - * Uses gcc-internal data definitions. - * Based on the gcov-kernel patch by: - * Hubertus Franke - * Nigel Hinds - * Rajan Ravindran - * Peter Oberparleiter - * Paul Larson - * Yi CDL Yang - */ - -#define pr_fmt(fmt) "gcov: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gcov.h" - -/** - * struct gcov_node - represents a debugfs entry - * @list: list head for child node list - * @children: child nodes - * @all: list head for list of all nodes - * @parent: parent node - * @loaded_info: array of pointers to profiling data sets for loaded object - * files. - * @num_loaded: number of profiling data sets for loaded object files. - * @unloaded_info: accumulated copy of profiling data sets for unloaded - * object files. Used only when gcov_persist=1. - * @dentry: main debugfs entry, either a directory or data file - * @links: associated symbolic links - * @name: data file basename - * - * struct gcov_node represents an entity within the gcov/ subdirectory - * of debugfs. There are directory and data file nodes. The latter represent - * the actual synthesized data file plus any associated symbolic links which - * are needed by the gcov tool to work correctly. 
- */ -struct gcov_node { - struct list_head list; - struct list_head children; - struct list_head all; - struct gcov_node *parent; - struct gcov_info **loaded_info; - struct gcov_info *unloaded_info; - struct dentry *dentry; - struct dentry **links; - int num_loaded; - char name[0]; -}; - -static const char objtree[] = OBJTREE; -static const char srctree[] = SRCTREE; -static struct gcov_node root_node; -static struct dentry *reset_dentry; -static LIST_HEAD(all_head); -static DEFINE_MUTEX(node_lock); - -/* If non-zero, keep copies of profiling data for unloaded modules. */ -static int gcov_persist = 1; - -static int __init gcov_persist_setup(char *str) -{ - unsigned long val; - - if (strict_strtoul(str, 0, &val)) { - pr_warning("invalid gcov_persist parameter '%s'\n", str); - return 0; - } - gcov_persist = val; - pr_info("setting gcov_persist to %d\n", gcov_persist); - - return 1; -} -__setup("gcov_persist=", gcov_persist_setup); - -/* - * seq_file.start() implementation for gcov data files. Note that the - * gcov_iterator interface is designed to be more restrictive than seq_file - * (no start from arbitrary position, etc.), to simplify the iterator - * implementation. - */ -static void *gcov_seq_start(struct seq_file *seq, loff_t *pos) -{ - loff_t i; - - gcov_iter_start(seq->private); - for (i = 0; i < *pos; i++) { - if (gcov_iter_next(seq->private)) - return NULL; - } - return seq->private; -} - -/* seq_file.next() implementation for gcov data files. */ -static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos) -{ - struct gcov_iterator *iter = data; - - if (gcov_iter_next(iter)) - return NULL; - (*pos)++; - - return iter; -} - -/* seq_file.show() implementation for gcov data files. */ -static int gcov_seq_show(struct seq_file *seq, void *data) -{ - struct gcov_iterator *iter = data; - - if (gcov_iter_write(iter, seq)) - return -EINVAL; - return 0; -} - -static void gcov_seq_stop(struct seq_file *seq, void *data) -{ - /* Unused. 
*/ -} - -static const struct seq_operations gcov_seq_ops = { - .start = gcov_seq_start, - .next = gcov_seq_next, - .show = gcov_seq_show, - .stop = gcov_seq_stop, -}; - -/* - * Return a profiling data set associated with the given node. This is - * either a data set for a loaded object file or a data set copy in case - * all associated object files have been unloaded. - */ -static struct gcov_info *get_node_info(struct gcov_node *node) -{ - if (node->num_loaded > 0) - return node->loaded_info[0]; - - return node->unloaded_info; -} - -/* - * Return a newly allocated profiling data set which contains the sum of - * all profiling data associated with the given node. - */ -static struct gcov_info *get_accumulated_info(struct gcov_node *node) -{ - struct gcov_info *info; - int i = 0; - - if (node->unloaded_info) - info = gcov_info_dup(node->unloaded_info); - else - info = gcov_info_dup(node->loaded_info[i++]); - if (!info) - return NULL; - for (; i < node->num_loaded; i++) - gcov_info_add(info, node->loaded_info[i]); - - return info; -} - -/* - * open() implementation for gcov data files. Create a copy of the profiling - * data set and initialize the iterator and seq_file interface. - */ -static int gcov_seq_open(struct inode *inode, struct file *file) -{ - struct gcov_node *node = inode->i_private; - struct gcov_iterator *iter; - struct seq_file *seq; - struct gcov_info *info; - int rc = -ENOMEM; - - mutex_lock(&node_lock); - /* - * Read from a profiling data copy to minimize reference tracking - * complexity and concurrent access and to keep accumulating multiple - * profiling data sets associated with one node simple. 
- */ - info = get_accumulated_info(node); - if (!info) - goto out_unlock; - iter = gcov_iter_new(info); - if (!iter) - goto err_free_info; - rc = seq_open(file, &gcov_seq_ops); - if (rc) - goto err_free_iter_info; - seq = file->private_data; - seq->private = iter; -out_unlock: - mutex_unlock(&node_lock); - return rc; - -err_free_iter_info: - gcov_iter_free(iter); -err_free_info: - gcov_info_free(info); - goto out_unlock; -} - -/* - * release() implementation for gcov data files. Release resources allocated - * by open(). - */ -static int gcov_seq_release(struct inode *inode, struct file *file) -{ - struct gcov_iterator *iter; - struct gcov_info *info; - struct seq_file *seq; - - seq = file->private_data; - iter = seq->private; - info = gcov_iter_get_info(iter); - gcov_iter_free(iter); - gcov_info_free(info); - seq_release(inode, file); - - return 0; -} - -/* - * Find a node by the associated data file name. Needs to be called with - * node_lock held. - */ -static struct gcov_node *get_node_by_name(const char *name) -{ - struct gcov_node *node; - struct gcov_info *info; - - list_for_each_entry(node, &all_head, all) { - info = get_node_info(node); - if (info && (strcmp(info->filename, name) == 0)) - return node; - } - - return NULL; -} - -/* - * Reset all profiling data associated with the specified node. - */ -static void reset_node(struct gcov_node *node) -{ - int i; - - if (node->unloaded_info) - gcov_info_reset(node->unloaded_info); - for (i = 0; i < node->num_loaded; i++) - gcov_info_reset(node->loaded_info[i]); -} - -static void remove_node(struct gcov_node *node); - -/* - * write() implementation for gcov data files. Reset profiling data for the - * corresponding file. If all associated object files have been unloaded, - * remove the debug fs node as well. 
- */ -static ssize_t gcov_seq_write(struct file *file, const char __user *addr, - size_t len, loff_t *pos) -{ - struct seq_file *seq; - struct gcov_info *info; - struct gcov_node *node; - - seq = file->private_data; - info = gcov_iter_get_info(seq->private); - mutex_lock(&node_lock); - node = get_node_by_name(info->filename); - if (node) { - /* Reset counts or remove node for unloaded modules. */ - if (node->num_loaded == 0) - remove_node(node); - else - reset_node(node); - } - /* Reset counts for open file. */ - gcov_info_reset(info); - mutex_unlock(&node_lock); - - return len; -} - -/* - * Given a string representing a file path of format: - * path/to/file.gcda - * construct and return a new string: - * path/to/file. - */ -static char *link_target(const char *dir, const char *path, const char *ext) -{ - char *target; - char *old_ext; - char *copy; - - copy = kstrdup(path, GFP_KERNEL); - if (!copy) - return NULL; - old_ext = strrchr(copy, '.'); - if (old_ext) - *old_ext = '\0'; - if (dir) - target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext); - else - target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext); - kfree(copy); - - return target; -} - -/* - * Construct a string representing the symbolic link target for the given - * gcov data file name and link type. Depending on the link type and the - * location of the data file, the link target can either point to a - * subdirectory of srctree, objtree or in an external location. - */ -static char *get_link_target(const char *filename, const struct gcov_link *ext) -{ - const char *rel; - char *result; - - if (strncmp(filename, objtree, strlen(objtree)) == 0) { - rel = filename + strlen(objtree) + 1; - if (ext->dir == SRC_TREE) - result = link_target(srctree, rel, ext->ext); - else - result = link_target(objtree, rel, ext->ext); - } else { - /* External compilation. 
*/ - result = link_target(NULL, filename, ext->ext); - } - - return result; -} - -#define SKEW_PREFIX ".tmp_" - -/* - * For a filename .tmp_filename.ext return filename.ext. Needed to compensate - * for filename skewing caused by the mod-versioning mechanism. - */ -static const char *deskew(const char *basename) -{ - if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0) - return basename + sizeof(SKEW_PREFIX) - 1; - return basename; -} - -/* - * Create links to additional files (usually .c and .gcno files) which the - * gcov tool expects to find in the same directory as the gcov data file. - */ -static void add_links(struct gcov_node *node, struct dentry *parent) -{ - char *basename; - char *target; - int num; - int i; - - for (num = 0; gcov_link[num].ext; num++) - /* Nothing. */; - node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL); - if (!node->links) - return; - for (i = 0; i < num; i++) { - target = get_link_target(get_node_info(node)->filename, - &gcov_link[i]); - if (!target) - goto out_err; - basename = strrchr(target, '/'); - if (!basename) - goto out_err; - basename++; - node->links[i] = debugfs_create_symlink(deskew(basename), - parent, target); - if (!node->links[i]) - goto out_err; - kfree(target); - } - - return; -out_err: - kfree(target); - while (i-- > 0) - debugfs_remove(node->links[i]); - kfree(node->links); - node->links = NULL; -} - -static const struct file_operations gcov_data_fops = { - .open = gcov_seq_open, - .release = gcov_seq_release, - .read = seq_read, - .llseek = seq_lseek, - .write = gcov_seq_write, -}; - -/* Basic initialization of a new node. 
*/ -static void init_node(struct gcov_node *node, struct gcov_info *info, - const char *name, struct gcov_node *parent) -{ - INIT_LIST_HEAD(&node->list); - INIT_LIST_HEAD(&node->children); - INIT_LIST_HEAD(&node->all); - if (node->loaded_info) { - node->loaded_info[0] = info; - node->num_loaded = 1; - } - node->parent = parent; - if (name) - strcpy(node->name, name); -} - -/* - * Create a new node and associated debugfs entry. Needs to be called with - * node_lock held. - */ -static struct gcov_node *new_node(struct gcov_node *parent, - struct gcov_info *info, const char *name) -{ - struct gcov_node *node; - - node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); - if (!node) - goto err_nomem; - if (info) { - node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), - GFP_KERNEL); - if (!node->loaded_info) - goto err_nomem; - } - init_node(node, info, name, parent); - /* Differentiate between gcov data file nodes and directory nodes. */ - if (info) { - node->dentry = debugfs_create_file(deskew(node->name), 0600, - parent->dentry, node, &gcov_data_fops); - } else - node->dentry = debugfs_create_dir(node->name, parent->dentry); - if (!node->dentry) { - pr_warning("could not create file\n"); - kfree(node); - return NULL; - } - if (info) - add_links(node, parent->dentry); - list_add(&node->list, &parent->children); - list_add(&node->all, &all_head); - - return node; - -err_nomem: - kfree(node); - pr_warning("out of memory\n"); - return NULL; -} - -/* Remove symbolic links associated with node. */ -static void remove_links(struct gcov_node *node) -{ - int i; - - if (!node->links) - return; - for (i = 0; gcov_link[i].ext; i++) - debugfs_remove(node->links[i]); - kfree(node->links); - node->links = NULL; -} - -/* - * Remove node from all lists and debugfs and release associated resources. - * Needs to be called with node_lock held. 
- */ -static void release_node(struct gcov_node *node) -{ - list_del(&node->list); - list_del(&node->all); - debugfs_remove(node->dentry); - remove_links(node); - kfree(node->loaded_info); - if (node->unloaded_info) - gcov_info_free(node->unloaded_info); - kfree(node); -} - -/* Release node and empty parents. Needs to be called with node_lock held. */ -static void remove_node(struct gcov_node *node) -{ - struct gcov_node *parent; - - while ((node != &root_node) && list_empty(&node->children)) { - parent = node->parent; - release_node(node); - node = parent; - } -} - -/* - * Find child node with given basename. Needs to be called with node_lock - * held. - */ -static struct gcov_node *get_child_by_name(struct gcov_node *parent, - const char *name) -{ - struct gcov_node *node; - - list_for_each_entry(node, &parent->children, list) { - if (strcmp(node->name, name) == 0) - return node; - } - - return NULL; -} - -/* - * write() implementation for reset file. Reset all profiling data to zero - * and remove nodes for which all associated object files are unloaded. - */ -static ssize_t reset_write(struct file *file, const char __user *addr, - size_t len, loff_t *pos) -{ - struct gcov_node *node; - - mutex_lock(&node_lock); -restart: - list_for_each_entry(node, &all_head, all) { - if (node->num_loaded > 0) - reset_node(node); - else if (list_empty(&node->children)) { - remove_node(node); - /* Several nodes may have gone - restart loop. */ - goto restart; - } - } - mutex_unlock(&node_lock); - - return len; -} - -/* read() implementation for reset file. Unused. */ -static ssize_t reset_read(struct file *file, char __user *addr, size_t len, - loff_t *pos) -{ - /* Allow read operation so that a recursive copy won't fail. */ - return 0; -} - -static const struct file_operations gcov_reset_fops = { - .write = reset_write, - .read = reset_read, - .llseek = noop_llseek, -}; - -/* - * Create a node for a given profiling data set and add it to all lists and - * debugfs. 
Needs to be called with node_lock held. - */ -static void add_node(struct gcov_info *info) -{ - char *filename; - char *curr; - char *next; - struct gcov_node *parent; - struct gcov_node *node; - - filename = kstrdup(info->filename, GFP_KERNEL); - if (!filename) - return; - parent = &root_node; - /* Create directory nodes along the path. */ - for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) { - if (curr == next) - continue; - *next = 0; - if (strcmp(curr, ".") == 0) - continue; - if (strcmp(curr, "..") == 0) { - if (!parent->parent) - goto err_remove; - parent = parent->parent; - continue; - } - node = get_child_by_name(parent, curr); - if (!node) { - node = new_node(parent, NULL, curr); - if (!node) - goto err_remove; - } - parent = node; - } - /* Create file node. */ - node = new_node(parent, info, curr); - if (!node) - goto err_remove; -out: - kfree(filename); - return; - -err_remove: - remove_node(parent); - goto out; -} - -/* - * Associate a profiling data set with an existing node. Needs to be called - * with node_lock held. - */ -static void add_info(struct gcov_node *node, struct gcov_info *info) -{ - struct gcov_info **loaded_info; - int num = node->num_loaded; - - /* - * Prepare new array. This is done first to simplify cleanup in - * case the new data set is incompatible, the node only contains - * unloaded data sets and there's not enough memory for the array. - */ - loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); - if (!loaded_info) { - pr_warning("could not add '%s' (out of memory)\n", - info->filename); - return; - } - memcpy(loaded_info, node->loaded_info, - num * sizeof(struct gcov_info *)); - loaded_info[num] = info; - /* Check if the new data set is compatible. */ - if (num == 0) { - /* - * A module was unloaded, modified and reloaded. The new - * data set replaces the copy of the last one. 
- */ - if (!gcov_info_is_compatible(node->unloaded_info, info)) { - pr_warning("discarding saved data for %s " - "(incompatible version)\n", info->filename); - gcov_info_free(node->unloaded_info); - node->unloaded_info = NULL; - } - } else { - /* - * Two different versions of the same object file are loaded. - * The initial one takes precedence. - */ - if (!gcov_info_is_compatible(node->loaded_info[0], info)) { - pr_warning("could not add '%s' (incompatible " - "version)\n", info->filename); - kfree(loaded_info); - return; - } - } - /* Overwrite previous array. */ - kfree(node->loaded_info); - node->loaded_info = loaded_info; - node->num_loaded = num + 1; -} - -/* - * Return the index of a profiling data set associated with a node. - */ -static int get_info_index(struct gcov_node *node, struct gcov_info *info) -{ - int i; - - for (i = 0; i < node->num_loaded; i++) { - if (node->loaded_info[i] == info) - return i; - } - return -ENOENT; -} - -/* - * Save the data of a profiling data set which is being unloaded. - */ -static void save_info(struct gcov_node *node, struct gcov_info *info) -{ - if (node->unloaded_info) - gcov_info_add(node->unloaded_info, info); - else { - node->unloaded_info = gcov_info_dup(info); - if (!node->unloaded_info) { - pr_warning("could not save data for '%s' " - "(out of memory)\n", info->filename); - } - } -} - -/* - * Disassociate a profiling data set from a node. Needs to be called with - * node_lock held. - */ -static void remove_info(struct gcov_node *node, struct gcov_info *info) -{ - int i; - - i = get_info_index(node, info); - if (i < 0) { - pr_warning("could not remove '%s' (not found)\n", - info->filename); - return; - } - if (gcov_persist) - save_info(node, info); - /* Shrink array. */ - node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; - node->num_loaded--; - if (node->num_loaded > 0) - return; - /* Last loaded data set was removed. 
*/ - kfree(node->loaded_info); - node->loaded_info = NULL; - node->num_loaded = 0; - if (!node->unloaded_info) - remove_node(node); -} - -/* - * Callback to create/remove profiling files when code compiled with - * -fprofile-arcs is loaded/unloaded. - */ -void gcov_event(enum gcov_action action, struct gcov_info *info) -{ - struct gcov_node *node; - - mutex_lock(&node_lock); - node = get_node_by_name(info->filename); - switch (action) { - case GCOV_ADD: - if (node) - add_info(node, info); - else - add_node(info); - break; - case GCOV_REMOVE: - if (node) - remove_info(node, info); - else { - pr_warning("could not remove '%s' (not found)\n", - info->filename); - } - break; - } - mutex_unlock(&node_lock); -} - -/* Create debugfs entries. */ -static __init int gcov_fs_init(void) -{ - int rc = -EIO; - - init_node(&root_node, NULL, NULL, NULL); - /* - * /sys/kernel/debug/gcov will be parent for the reset control file - * and all profiling files. - */ - root_node.dentry = debugfs_create_dir("gcov", NULL); - if (!root_node.dentry) - goto err_remove; - /* - * Create reset file which resets all profiling counts when written - * to. - */ - reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, - NULL, &gcov_reset_fops); - if (!reset_dentry) - goto err_remove; - /* Replay previous events to get our fs hierarchy up-to-date. */ - gcov_enable_events(); - return 0; - -err_remove: - pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); - - return rc; -} -device_initcall(gcov_fs_init); -/* - * This code provides functions to handle gcc's profiling data format - * introduced with gcc 3.4. Future versions of gcc may change the gcov - * format (as happened before), so all format-specific information needs - * to be kept modular and easily exchangeable. - * - * This file is based on gcc-internal definitions. Functions and data - * structures are defined to be compatible with gcc counterparts. 
- * For a better understanding, refer to gcc source: gcc/gcov-io.h. - * - * Copyright IBM Corp. 2009 - * Author(s): Peter Oberparleiter - * - * Uses gcc-internal data definitions. - */ - -#include -#include -#include -#include -#include -#include "gcov.h" - -/* Symbolic links to be created for each profiling data file. */ -const struct gcov_link gcov_link[] = { - { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ - { 0, NULL}, -}; - -/* - * Determine whether a counter is active. Based on gcc magic. Doesn't change - * at run-time. - */ -static int counter_active(struct gcov_info *info, unsigned int type) -{ - return (1 << type) & info->ctr_mask; -} - -/* Determine number of active counters. Based on gcc magic. */ -static unsigned int num_counter_active(struct gcov_info *info) -{ - unsigned int i; - unsigned int result = 0; - - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(info, i)) - result++; - } - return result; -} - -/** - * gcov_info_reset - reset profiling data to zero - * @info: profiling data set - */ -void gcov_info_reset(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active; i++) { - memset(info->counts[i].values, 0, - info->counts[i].num * sizeof(gcov_type)); - } -} - -/** - * gcov_info_is_compatible - check if profiling data can be added - * @info1: first profiling data set - * @info2: second profiling data set - * - * Returns non-zero if profiling data can be added, zero otherwise. - */ -int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) -{ - return (info1->stamp == info2->stamp); -} - -/** - * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added - * - * Adds profiling counts of @source to @dest. 
- */ -void gcov_info_add(struct gcov_info *dest, struct gcov_info *source) -{ - unsigned int i; - unsigned int j; - - for (i = 0; i < num_counter_active(dest); i++) { - for (j = 0; j < dest->counts[i].num; j++) { - dest->counts[i].values[j] += - source->counts[i].values[j]; - } - } -} - -/* Get size of function info entry. Based on gcc magic. */ -static size_t get_fn_size(struct gcov_info *info) -{ - size_t size; - - size = sizeof(struct gcov_fn_info) + num_counter_active(info) * - sizeof(unsigned int); - if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int)) - size = ALIGN(size, __alignof__(struct gcov_fn_info)); - return size; -} - -/* Get address of function info entry. Based on gcc magic. */ -static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn) -{ - return (struct gcov_fn_info *) - ((char *) info->functions + fn * get_fn_size(info)); -} - -/** - * gcov_info_dup - duplicate profiling data set - * @info: profiling data set to duplicate - * - * Return newly allocated duplicate on success, %NULL on error. - */ -struct gcov_info *gcov_info_dup(struct gcov_info *info) -{ - struct gcov_info *dup; - unsigned int i; - unsigned int active; - - /* Duplicate gcov_info. */ - active = num_counter_active(info); - dup = kzalloc(sizeof(struct gcov_info) + - sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); - if (!dup) - return NULL; - dup->version = info->version; - dup->stamp = info->stamp; - dup->n_functions = info->n_functions; - dup->ctr_mask = info->ctr_mask; - /* Duplicate filename. */ - dup->filename = kstrdup(info->filename, GFP_KERNEL); - if (!dup->filename) - goto err_free; - /* Duplicate table of functions. */ - dup->functions = kmemdup(info->functions, info->n_functions * - get_fn_size(info), GFP_KERNEL); - if (!dup->functions) - goto err_free; - /* Duplicate counter arrays. 
*/ - for (i = 0; i < active ; i++) { - struct gcov_ctr_info *ctr = &info->counts[i]; - size_t size = ctr->num * sizeof(gcov_type); - - dup->counts[i].num = ctr->num; - dup->counts[i].merge = ctr->merge; - dup->counts[i].values = vmalloc(size); - if (!dup->counts[i].values) - goto err_free; - memcpy(dup->counts[i].values, ctr->values, size); - } - return dup; - -err_free: - gcov_info_free(dup); - return NULL; -} - -/** - * gcov_info_free - release memory for profiling data set duplicate - * @info: profiling data set duplicate to free - */ -void gcov_info_free(struct gcov_info *info) -{ - unsigned int active = num_counter_active(info); - unsigned int i; - - for (i = 0; i < active ; i++) - vfree(info->counts[i].values); - kfree(info->functions); - kfree(info->filename); - kfree(info); -} - -/** - * struct type_info - iterator helper array - * @ctr_type: counter type - * @offset: index of the first value of the current function for this type - * - * This array is needed to convert the in-memory data format into the in-file - * data format: - * - * In-memory: - * for each counter type - * for each function - * values - * - * In-file: - * for each function - * for each counter type - * values - * - * See gcc source gcc/gcov-io.h for more information on data organization. 
- */ -struct type_info { - int ctr_type; - unsigned int offset; -}; - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @record: record type - * @function: function number - * @type: counter type - * @count: index into values array - * @num_types: number of counter types - * @type_info: helper array to get values-array offset for current function - */ -struct gcov_iterator { - struct gcov_info *info; - - int record; - unsigned int function; - unsigned int type; - unsigned int count; - - int num_types; - struct type_info type_info[0]; -}; - -static struct gcov_fn_info *get_func(struct gcov_iterator *iter) -{ - return get_fn_info(iter->info, iter->function); -} - -static struct type_info *get_type(struct gcov_iterator *iter) -{ - return &iter->type_info[iter->type]; -} - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. 
- */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator) + - num_counter_active(info) * sizeof(struct type_info), - GFP_KERNEL); - if (iter) - iter->info = info; - - return iter; -} - -/** - * gcov_iter_free - release memory for iterator - * @iter: file iterator to free - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - int i; - - iter->record = 0; - iter->function = 0; - iter->type = 0; - iter->count = 0; - iter->num_types = 0; - for (i = 0; i < GCOV_COUNTERS; i++) { - if (counter_active(iter->info, i)) { - iter->type_info[iter->num_types].ctr_type = i; - iter->type_info[iter->num_types++].offset = 0; - } - } -} - -/* Mapping of logical record number to actual file content. */ -#define RECORD_FILE_MAGIC 0 -#define RECORD_GCOV_VERSION 1 -#define RECORD_TIME_STAMP 2 -#define RECORD_FUNCTION_TAG 3 -#define RECORD_FUNCTON_TAG_LEN 4 -#define RECORD_FUNCTION_IDENT 5 -#define RECORD_FUNCTION_CHECK 6 -#define RECORD_COUNT_TAG 7 -#define RECORD_COUNT_LEN 8 -#define RECORD_COUNT 9 - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. 
- */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - switch (iter->record) { - case RECORD_FILE_MAGIC: - case RECORD_GCOV_VERSION: - case RECORD_FUNCTION_TAG: - case RECORD_FUNCTON_TAG_LEN: - case RECORD_FUNCTION_IDENT: - case RECORD_COUNT_TAG: - /* Advance to next record */ - iter->record++; - break; - case RECORD_COUNT: - /* Advance to next count */ - iter->count++; - /* fall through */ - case RECORD_COUNT_LEN: - if (iter->count < get_func(iter)->n_ctrs[iter->type]) { - iter->record = 9; - break; - } - /* Advance to next counter type */ - get_type(iter)->offset += iter->count; - iter->count = 0; - iter->type++; - /* fall through */ - case RECORD_FUNCTION_CHECK: - if (iter->type < iter->num_types) { - iter->record = 7; - break; - } - /* Advance to next function */ - iter->type = 0; - iter->function++; - /* fall through */ - case RECORD_TIME_STAMP: - if (iter->function < iter->info->n_functions) - iter->record = 3; - else - iter->record = -1; - break; - } - /* Check for EOF. */ - if (iter->record == -1) - return -EINVAL; - else - return 0; -} - -/** - * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. - */ -static int seq_write_gcov_u32(struct seq_file *seq, u32 v) -{ - return seq_write(seq, &v, sizeof(v)); -} - -/** - * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file - * @seq: seq_file handle - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. 
- */ -static int seq_write_gcov_u64(struct seq_file *seq, u64 v) -{ - u32 data[2]; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - return seq_write(seq, data, sizeof(data)); -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - int rc = -EINVAL; - - switch (iter->record) { - case RECORD_FILE_MAGIC: - rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC); - break; - case RECORD_GCOV_VERSION: - rc = seq_write_gcov_u32(seq, iter->info->version); - break; - case RECORD_TIME_STAMP: - rc = seq_write_gcov_u32(seq, iter->info->stamp); - break; - case RECORD_FUNCTION_TAG: - rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); - break; - case RECORD_FUNCTON_TAG_LEN: - rc = seq_write_gcov_u32(seq, 2); - break; - case RECORD_FUNCTION_IDENT: - rc = seq_write_gcov_u32(seq, get_func(iter)->ident); - break; - case RECORD_FUNCTION_CHECK: - rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); - break; - case RECORD_COUNT_TAG: - rc = seq_write_gcov_u32(seq, - GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type)); - break; - case RECORD_COUNT_LEN: - rc = seq_write_gcov_u32(seq, - get_func(iter)->n_ctrs[iter->type] * 2); - break; - case RECORD_COUNT: - rc = seq_write_gcov_u64(seq, - iter->info->counts[iter->type]. - values[iter->count + get_type(iter)->offset]); - break; - } - return rc; -} -/* - * Supplementary group IDs - */ -#include -#include -#include -#include -#include -#include - -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - -struct group_info *groups_alloc(int gidsetsize) -{ - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? 
: 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) - return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - gid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; - -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; -} - -EXPORT_SYMBOL(groups_alloc); - -void groups_free(struct group_info *group_info) -{ - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); -} - -EXPORT_SYMBOL(groups_free); - -/* export the group_info to a user-space array */ -static int groups_to_user(gid_t __user *grouplist, - const struct group_info *group_info) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - return 0; -} - -/* fill a group_info from a user-space array - it must be allocated already */ -static int groups_from_user(struct group_info *group_info, - gid_t __user *grouplist) -{ - int i; - unsigned int count = group_info->ngroups; - - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_from_user(group_info->blocks[i], grouplist, len)) - return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; - } - 
return 0; -} - -/* a simple Shell sort */ -static void groups_sort(struct group_info *group_info) -{ - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); - - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); - right = left; - left -= stride; - } - GROUP_AT(group_info, right) = tmp; - } - stride /= 3; - } -} - -/* a simple bsearch */ -int groups_search(const struct group_info *group_info, gid_t grp) -{ - unsigned int left, right; - - if (!group_info) - return 0; - - left = 0; - right = group_info->ngroups; - while (left < right) { - unsigned int mid = (left+right)/2; - if (grp > GROUP_AT(group_info, mid)) - left = mid + 1; - else if (grp < GROUP_AT(group_info, mid)) - right = mid; - else - return 1; - } - return 0; -} - -/** - * set_groups - Change a group subscription in a set of credentials - * @new: The newly prepared set of credentials to alter - * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. - */ -int set_groups(struct cred *new, struct group_info *group_info) -{ - put_group_info(new->group_info); - groups_sort(group_info); - get_group_info(group_info); - new->group_info = group_info; - return 0; -} - -EXPORT_SYMBOL(set_groups); - -/** - * set_current_groups - Change current's group subscription - * @group_info: The group list to impose - * - * Validate a group subscription and, if valid, impose it upon current's task - * security record. 
- */ -int set_current_groups(struct group_info *group_info) -{ - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); -} - -EXPORT_SYMBOL(set_current_groups); - -SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - const struct cred *cred = current_cred(); - int i; - - if (gidsetsize < 0) - return -EINVAL; - - /* no need to grab task_lock here; it cannot change */ - i = cred->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups_to_user(grouplist, cred->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - return i; -} - -/* - * SMP: Our groups are copy-on-write. We can set them safely - * without another task interfering. - */ - -SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) -{ - struct group_info *group_info; - int retval; - - if (!nsown_capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } - - retval = set_current_groups(group_info); - put_group_info(group_info); - - return retval; -} - -/* - * Check whether we're fsgid/egid or in the supplemental group.. 
- */ -int in_group_p(gid_t grp) -{ - const struct cred *cred = current_cred(); - int retval = 1; - - if (grp != cred->fsgid) - retval = groups_search(cred->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_group_p); - -int in_egroup_p(gid_t grp) -{ - const struct cred *cred = current_cred(); - int retval = 1; - - if (grp != cred->egid) - retval = groups_search(cred->group_info, grp); - return retval; -} - -EXPORT_SYMBOL(in_egroup_p); -/* - * linux/kernel/hrtimer.c - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner - * - * High-resolution kernel timers - * - * In contrast to the low-resolution timeout API implemented in - * kernel/timer.c, hrtimers provide finer resolution and accuracy - * depending on system configuration and capabilities. - * - * These timers are currently used for: - * - itimers - * - POSIX timers - * - nanosleep - * - precise in-kernel timing - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * Credits: - * based on kernel/timer.c - * - * Help, testing, suggestions, bugfixes, improvements were - * provided by: - * - * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel - * et. al. - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -/* - * The timer bases: - * - * There are more clockids then hrtimer bases. Thus, we index - * into the timer bases by the hrtimer_base_type enum. When trying - * to reach a base using a clockid, hrtimer_clockid_to_base() - * is used to convert from clockid to the proper hrtimer_base_type. 
- */ -DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = -{ - - .clock_base = - { - { - .index = HRTIMER_BASE_MONOTONIC, - .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, - }, - { - .index = HRTIMER_BASE_REALTIME, - .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, - }, - { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, - }, - } -}; - -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, -}; - -static inline int hrtimer_clockid_to_base(clockid_t clock_id) -{ - return hrtimer_clock_to_base_table[clock_id]; -} - - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. - */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot; - struct timespec xts, tom, slp; - - get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); - - xtim = timespec_to_ktime(xts); - mono = ktime_add(xtim, timespec_to_ktime(tom)); - boot = ktime_add(mono, timespec_to_ktime(slp)); - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; -} - -/* - * Functions and macros which are different for UP/SMP systems are kept in a - * single place - */ -#ifdef CONFIG_SMP - -/* - * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. - * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on the lists/queues. 
- * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. - */ -static -struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) -{ - struct hrtimer_clock_base *base; - - for (;;) { - base = timer->base; - if (likely(base != NULL)) { - raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - if (likely(base == timer->base)) - return base; - /* The timer has migrated to another CPU: */ - raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); - } - cpu_relax(); - } -} - - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - -/* - * With HIGHRES=y we do not migrate the timer when it is expiring - * before the next event on the target cpu because we cannot reprogram - * the target cpu hardware and we would cause it to fire late. - * - * Called with cpu_base->lock of target cpu held. - */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) -{ -#ifdef CONFIG_HIGH_RES_TIMERS - ktime_t expires; - - if (!new_base->cpu_base->hres_active) - return 0; - - expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires.tv64 <= new_base->cpu_base->expires_next.tv64; -#else - return 0; -#endif -} - -/* - * Switch the timer base to the current CPU when possible. 
- */ -static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, - int pinned) -{ - struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); - int basenum = base->index; - -again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[basenum]; - - if (base != new_base) { - /* - * We are trying to move timer to new_base. - * However we can't change timer's base while it is running, - * so we keep it on the same CPU. No hassle vs. reprogramming - * the event source in the high resolution case. The softirq - * code will take care of this when the timer function has - * completed. There is no conflict as we hold the lock until - * the timer is enqueued. - */ - if (unlikely(hrtimer_callback_running(timer))) - return base; - - /* See the comment in lock_timer_base() */ - timer->base = NULL; - raw_spin_unlock(&base->cpu_base->lock); - raw_spin_lock(&new_base->cpu_base->lock); - - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; - raw_spin_unlock(&new_base->cpu_base->lock); - raw_spin_lock(&base->cpu_base->lock); - timer->base = base; - goto again; - } - timer->base = new_base; - } - return new_base; -} - -#else /* CONFIG_SMP */ - -static inline struct hrtimer_clock_base * -lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) -{ - struct hrtimer_clock_base *base = timer->base; - - raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - - return base; -} - -# define switch_hrtimer_base(t, b, p) (b) - -#endif /* !CONFIG_SMP */ - -/* - * Functions for the union type storage format of ktime_t which are - * too large for inlining: - */ -#if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt: addend - * @nsec: the scalar nsec value to add - * - * 
Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt: minuend - * @nsec: the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ - ktime_t tmp; - - if (likely(nsec < NSEC_PER_SEC)) { - tmp.tv64 = nsec; - } else { - unsigned long rem = do_div(nsec, NSEC_PER_SEC); - - tmp = ktime_set((long)nsec, rem); - } - - return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ - -/* - * Divide a ktime value by a nanosecond value - */ -u64 ktime_divns(const ktime_t kt, s64 div) -{ - u64 dclc; - int sft = 0; - - dclc = ktime_to_ns(kt); - /* Make sure the divisor is less than 2^32: */ - while (div >> 32) { - sft++; - div >>= 1; - } - dclc >>= sft; - do_div(dclc, (unsigned long) div); - - return dclc; -} -#endif /* BITS_PER_LONG >= 64 */ - -/* - * Add two ktime values and do a safety check for overflow: - */ -ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) -{ - ktime_t res = ktime_add(lhs, rhs); - - /* - * We use KTIME_SEC_MAX here, the maximum timeout which we can - * return to user space in a timespec: - */ - if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) - res = ktime_set(KTIME_SEC_MAX, 0); - - return res; -} - -EXPORT_SYMBOL_GPL(ktime_add_safe); - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr hrtimer_debug_descr; - -static void *hrtimer_debug_hint(void *addr) -{ - return ((struct hrtimer *) addr)->function; -} - -/* - * fixup_init is called when: - * - an active object is initialized - */ 
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct hrtimer *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - hrtimer_cancel(timer); - debug_object_init(timer, &hrtimer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) -{ - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct hrtimer *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - hrtimer_cancel(timer); - debug_object_free(timer, &hrtimer_debug_descr); - return 1; - default: - return 0; - } -} - -static struct debug_obj_descr hrtimer_debug_descr = { - .name = "hrtimer", - .debug_hint = hrtimer_debug_hint, - .fixup_init = hrtimer_fixup_init, - .fixup_activate = hrtimer_fixup_activate, - .fixup_free = hrtimer_fixup_free, -}; - -static inline void debug_hrtimer_init(struct hrtimer *timer) -{ - debug_object_init(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_activate(struct hrtimer *timer) -{ - debug_object_activate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) -{ - debug_object_deactivate(timer, &hrtimer_debug_descr); -} - -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode); - -void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_object_init_on_stack(timer, 
&hrtimer_debug_descr); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); - -void destroy_hrtimer_on_stack(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - -#else -static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } -static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } -#endif - -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) -{ - debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); -} - -static inline void debug_activate(struct hrtimer *timer) -{ - debug_hrtimer_activate(timer); - trace_hrtimer_start(timer); -} - -static inline void debug_deactivate(struct hrtimer *timer) -{ - debug_hrtimer_deactivate(timer); - trace_hrtimer_cancel(timer); -} - -/* High resolution timer related functions */ -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer enabled ? - */ -static int hrtimer_hres_enabled __read_mostly = 1; - -/* - * Enable / Disable high resolution mode - */ -static int __init setup_hrtimer_hres(char *str) -{ - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; -} - -__setup("highres=", setup_hrtimer_hres); - -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) -{ - return hrtimer_hres_enabled; -} - -/* - * Is the high resolution mode active ? 
- */ -static inline int hrtimer_hres_active(void) -{ - return __this_cpu_read(hrtimer_bases.hres_active); -} - -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) -{ - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } - - if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) - return; - - cpu_base->expires_next.tv64 = expires_next.tv64; - - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); -} - -/* - * Shared reprogramming for clock_realtime and clock_monotonic - * - * When a timer is enqueued and expires earlier than the already enqueued - * timers, we have to check, whether it expires earlier than the timer for - * which the clock event device was armed. 
- * - * Called with interrupts disabled and base->cpu_base.lock held - */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; - - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); - - /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. - */ - if (hrtimer_callback_running(timer)) - return 0; - - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. - */ - if (expires.tv64 < 0) - return -ETIME; - - if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; - - /* - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (cpu_base->hang_detected) - return 0; - - /* - * Clockevents returns -ETIME, when the event was in the past. - */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; -} - -/* - * Initialize the high resolution related parts of cpu_base - */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) -{ - base->expires_next.tv64 = KTIME_MAX; - base->hres_active = 0; -} - -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. 
The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. - */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) -{ - if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - if (wakeup) { - raw_spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - raw_spin_lock(&base->cpu_base->lock); - } else - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return 1; - } - - return 0; -} - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; - - if (!hrtimer_hres_active()) - return; - - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ - raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} - -/* - * Switch to high resolution mode - */ -static int hrtimer_switch_to_hres(void) -{ - int i, cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); - - if (tick_init_highres()) { - local_irq_restore(flags); - printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); - return 0; - } - base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; - - tick_setup_sched_timer(); - - /* "Retrigger" the interrupt 
to get things going */ - retrigger_next_event(NULL); - local_irq_restore(flags); - return 1; -} - -#else - -static inline int hrtimer_hres_active(void) { return 0; } -static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) -{ - return 0; -} -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void retrigger_next_event(void *arg) { } - -#endif /* CONFIG_HIGH_RES_TIMERS */ - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. 
- */ -void clock_was_set(void) -{ -#ifdef CONFIG_HIGH_RES_TIMERS - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -#endif - timerfd_clock_was_set(); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): - */ -void hrtimers_resume(void) -{ - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - - retrigger_next_event(NULL); - timerfd_clock_was_set(); -} - -static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (timer->start_site) - return; - timer->start_site = __builtin_return_address(0); - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -#endif -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; -#endif -} - -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (likely(!timer_stats_active)) - return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, 0); -#endif -} - -/* - * Counterpart to lock_hrtimer_base above: - */ -static inline -void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) -{ - raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); -} - -/** - * hrtimer_forward - forward the timer expiry - * @timer: hrtimer to forward - * @now: forward past this time - * @interval: the interval to forward - * - * Forward the timer expiry so it will expire in the future. - * Returns the number of overruns. 
- */ -u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) -{ - u64 orun = 1; - ktime_t delta; - - delta = ktime_sub(now, hrtimer_get_expires(timer)); - - if (delta.tv64 < 0) - return 0; - - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; - - if (unlikely(delta.tv64 >= interval.tv64)) { - s64 incr = ktime_to_ns(interval); - - orun = ktime_divns(delta, incr); - hrtimer_add_expires_ns(timer, incr * orun); - if (hrtimer_get_expires_tv64(timer) > now.tv64) - return orun; - /* - * This (and the ktime_add() below) is the - * correction for exact: - */ - orun++; - } - hrtimer_add_expires(timer, interval); - - return orun; -} -EXPORT_SYMBOL_GPL(hrtimer_forward); - -/* - * enqueue_hrtimer - internal function to (re)start a timer - * - * The timer is inserted in expiry order. Insertion into the - * red black tree is O(log(n)). Must hold the base lock. - * - * Returns 1 when the new timer is the leftmost timer in the tree. - */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - debug_activate(timer); - - timerqueue_add(&base->active, &timer->node); - base->cpu_base->active_bases |= 1 << base->index; - - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; - - return (&timer->node == base->active.next); -} - -/* - * __remove_hrtimer - internal function to remove a timer - * - * Caller must hold the base lock. - * - * High resolution timer mode reprograms the clock event device when the - * timer is the one which expires next. The caller can disable this by setting - * reprogram to zero. This is useful, when the context does a reprogramming - * anyway (e.g. 
timer interrupt) - */ -static void __remove_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) -{ - struct timerqueue_node *next_timer; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; - - next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (&timer->node == next_timer) { -#ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); - } -#endif - } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); -out: - timer->state = newstate; -} - -/* - * remove hrtimer, called with base lock held - */ -static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) -{ - if (hrtimer_is_queued(timer)) { - unsigned long state; - int reprogram; - - /* - * Remove the timer and force reprogramming when high - * resolution mode is active and the timer is on the current - * CPU. If we remove a timer on another CPU, reprogramming is - * skipped. The interrupt event on this CPU is fired and - * reprogramming happens in the interrupt handler. This is a - * rare case and less expensive than a smp call. - */ - debug_deactivate(timer); - timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. 
- */ - state = timer->state & HRTIMER_STATE_CALLBACK; - __remove_hrtimer(timer, base, state, reprogram); - return 1; - } - return 0; -} - -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) -{ - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int ret, leftmost; - - base = lock_hrtimer_base(timer, &flags); - - /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); - - /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - - if (mode & HRTIMER_MODE_REL) { - tim = ktime_add_safe(tim, new_base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); -#endif - } - - hrtimer_set_expires_range_ns(timer, tim, delta_ns); - - timer_stats_hrtimer_set_start_info(timer); - - leftmost = enqueue_hrtimer(timer, new_base); - - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? 
- */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base, wakeup); - - unlock_hrtimer_base(timer, &flags); - - return ret; -} - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); - -/** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - -/** - * hrtimer_try_to_cancel - try to deactivate a timer - * @timer: hrtimer to stop - * - * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently excuting the callback function and - * cannot be stopped - */ -int hrtimer_try_to_cancel(struct hrtimer *timer) -{ - struct hrtimer_clock_base *base; - unsigned long flags; - int ret = -1; - - base = lock_hrtimer_base(timer, &flags); - - if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); - - unlock_hrtimer_base(timer, &flags); - - return ret; - -} -EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); - -/** - * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
- * @timer: the timer to be cancelled - * - * Returns: - * 0 when the timer was not active - * 1 when the timer was active - */ -int hrtimer_cancel(struct hrtimer *timer) -{ - for (;;) { - int ret = hrtimer_try_to_cancel(timer); - - if (ret >= 0) - return ret; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(hrtimer_cancel); - -/** - * hrtimer_get_remaining - get remaining time for the timer - * @timer: the timer to read - */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) -{ - unsigned long flags; - ktime_t rem; - - lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); - unlock_hrtimer_base(timer, &flags); - - return rem; -} -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); - -#ifdef CONFIG_NO_HZ -/** - * hrtimer_get_next_event - get the time until next expiry event - * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. - */ -ktime_t hrtimer_get_next_event(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&cpu_base->lock, flags); - - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } - - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; -} -#endif - -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - struct hrtimer_cpu_base *cpu_base; - int base; - - memset(timer, 0, sizeof(struct hrtimer)); - - cpu_base = 
&__raw_get_cpu_var(hrtimer_bases); - - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) - clock_id = CLOCK_MONOTONIC; - - base = hrtimer_clockid_to_base(clock_id); - timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); - -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif -} - -/** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: timer mode abs/rel - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init); - -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution - * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. - */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) -{ - struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); - - cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); - - return 0; -} -EXPORT_SYMBOL_GPL(hrtimer_get_res); - -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) -{ - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; - enum hrtimer_restart (*fn)(struct hrtimer *); - int restart; - - WARN_ON(!irqs_disabled()); - - debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); - timer_stats_account_hrtimer(timer); - fn = timer->function; - - /* - * Because we run timers from hardirq context, there is no chance - * they get migrated to another cpu, therefore its safe to unlock - * the timer base. 
- */ - raw_spin_unlock(&cpu_base->lock); - trace_hrtimer_expire_entry(timer, now); - restart = fn(timer); - trace_hrtimer_expire_exit(timer); - raw_spin_lock(&cpu_base->lock); - - /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and - * we do not reprogramm the event hardware. Happens either in - * hrtimer_start_range_ns() or in hrtimer_interrupt() - */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base); - } - - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); - - timer->state &= ~HRTIMER_STATE_CALLBACK; -} - -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - entry_time = now = ktime_get(); -retry: - expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. 
- */ - cpu_base->expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; - struct timerqueue_node *node; - ktime_t basenow; - - if (!(cpu_base->active_bases & (1 << i))) - continue; - - base = cpu_base->clock_base + i; - basenow = ktime_add(now, base->offset); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - - /* - * The immediate goal for using the softexpires is - * minimizing wakeups, not running timers at the - * earliest interrupt after their soft expiration. - * This allows us to avoid using a Priority Search - * Tree, which can answer a stabbing querry for - * overlapping intervals and instead use the simple - * BST we already have. - * We don't add extra wakeups by delaying timers that - * are right-of a not yet expired timer, because that - * timer will have to trigger a wakeup anyway. - */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - break; - } - - __run_hrtimer(timer, &basenow); - } - } - - /* - * Store the new expiry value so the migration code can verify - * against it. - */ - cpu_base->expires_next = expires_next; - raw_spin_unlock(&cpu_base->lock); - - /* Reprogramming necessary ? */ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; - return; - } - - /* - * The next timer was already expired due to: - * - tracing - * - long lasting callbacks - * - being scheduled away when running in a VM - * - * We need to prevent that we loop forever in the hrtimer - * interrupt routine. We give it 3 attempts to avoid - * overreacting on some spurious event. 
- */ - now = ktime_get(); - cpu_base->nr_retries++; - if (++retries < 3) - goto retry; - /* - * Give the system a chance to do something else than looping - * here. We stored the entry time, so we know exactly how long - * we spent here. We schedule the next event this amount of - * time away. - */ - cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; - delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; - /* - * Limit it to a sensible value as we enforce a longer - * delay. Give the CPU at least 100ms to catch up. - */ - if (delta.tv64 > 100 * NSEC_PER_MSEC) - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); - else - expires_next = ktime_add(now, delta); - tick_program_event(expires_next, 1); - printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", - ktime_to_ns(delta)); -} - -/* - * local version of hrtimer_peek_ahead_timers() called with interrupts - * disabled. - */ -static void __hrtimer_peek_ahead_timers(void) -{ - struct tick_device *td; - - if (!hrtimer_hres_active()) - return; - - td = &__get_cpu_var(tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); -} - -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. 
- * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - -#else /* CONFIG_HIGH_RES_TIMERS */ - -static inline void __hrtimer_peek_ahead_timers(void) { } - -#endif /* !CONFIG_HIGH_RES_TIMERS */ - -/* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. 
- */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy - */ -void hrtimer_run_queues(void) -{ - struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; - - if (hrtimer_hres_active()) - return; - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); - } -} - -/* - * Sleep related functions: - */ -static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) -{ - struct hrtimer_sleeper *t = - container_of(timer, struct hrtimer_sleeper, timer); - struct task_struct *task = t->task; - - t->task = NULL; - if (task) - wake_up_process(task); - - return HRTIMER_NORESTART; -} - -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) -{ - sl->timer.function = hrtimer_wakeup; - sl->task = task; -} -EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); - -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) -{ - hrtimer_init_sleeper(t, current); - - do { - set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; - - if (likely(t->task)) - schedule(); - - hrtimer_cancel(&t->timer); - mode = HRTIMER_MODE_ABS; - - } while (t->task && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - - return t->task == NULL; -} - -static int update_rmtp(struct 
hrtimer *timer, struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = hrtimer_expires_remaining(timer); - if (rem.tv64 <= 0) - return 0; - rmt = ktime_to_timespec(rem); - - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - - return 1; -} - -long __sched hrtimer_nanosleep_restart(struct restart_block *restart) -{ - struct hrtimer_sleeper t; - struct timespec __user *rmtp; - int ret = 0; - - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - goto out; - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - destroy_hrtimer_on_stack(&t.timer); - return ret; -} - -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, - const enum hrtimer_mode mode, const clockid_t clockid) -{ - struct restart_block *restart; - struct hrtimer_sleeper t; - int ret = 0; - unsigned long slack; - - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; - - hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode)) - goto out; - - /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_MODE_ABS) { - ret = -ERESTARTNOHAND; - goto out; - } - - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - restart = ¤t_thread_info()->restart_block; - restart->fn = hrtimer_nanosleep_restart; - restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.rmtp = rmtp; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); - - ret = -ERESTART_RESTARTBLOCK; -out: - destroy_hrtimer_on_stack(&t.timer); - return ret; -} - -SYSCALL_DEFINE2(nanosleep, struct 
timespec __user *, rqtp, - struct timespec __user *, rmtp) -{ - struct timespec tu; - - if (copy_from_user(&tu, rqtp, sizeof(tu))) - return -EFAULT; - - if (!timespec_valid(&tu)) - return -EINVAL; - - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); -} - -/* - * Functions related to boot-time initialization: - */ -static void __cpuinit init_hrtimers_cpu(int cpu) -{ - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - - raw_spin_lock_init(&cpu_base->lock); - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); - } - - hrtimer_init_hres(cpu_base); -} - -#ifdef CONFIG_HOTPLUG_CPU - -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) -{ - struct hrtimer *timer; - struct timerqueue_node *node; - - while ((node = timerqueue_getnext(&old_base->active))) { - timer = container_of(node, struct hrtimer, node); - BUG_ON(hrtimer_callback_running(timer)); - debug_deactivate(timer); - - /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the - * timer could be seen as !active and just vanish away - * under us on another CPU - */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); - timer->base = new_base; - /* - * Enqueue the timers on the new cpu. This does not - * reprogram the event device in case the timer - * expires before the earliest on this CPU, but we run - * hrtimer_interrupt after we migrated everything to - * sort out already expired timers and reprogram the - * event device. 
- */ - enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; - } -} - -static void migrate_hrtimers(int scpu) -{ - struct hrtimer_cpu_base *old_base, *new_base; - int i; - - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); - - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &__get_cpu_var(hrtimer_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. - */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); - } - - raw_spin_unlock(&old_base->lock); - raw_spin_unlock(&new_base->lock); - - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); -} - -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int scpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(scpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - case CPU_DYING_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - { - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); - migrate_hrtimers(scpu); - break; - } -#endif - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata hrtimers_nb = { - .notifier_call = hrtimer_cpu_notify, -}; - -void __init hrtimers_init(void) -{ - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif -} - -/** - * schedule_hrtimeout_range_clock - sleep until 
timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME - */ -int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode, int clock) -{ - struct hrtimer_sleeper t; - - /* - * Optimize when a zero timeout value is given. It does not - * matter whether this is an absolute or a relative time. - */ - if (expires && !expires->tv64) { - __set_current_state(TASK_RUNNING); - return 0; - } - - /* - * A NULL parameter means "infinite" - */ - if (!expires) { - schedule(); - __set_current_state(TASK_RUNNING); - return -EINTR; - } - - hrtimer_init_on_stack(&t.timer, clock, mode); - hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - - hrtimer_init_sleeper(&t, current); - - hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; - - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - destroy_hrtimer_on_stack(&t.timer); - - __set_current_state(TASK_RUNNING); - - return !t.task ? 0 : -EINTR; -} - -/** - * schedule_hrtimeout_range - sleep until timeout - * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. 
- * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired otherwise -EINTR - */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range_clock(expires, delta, mode, - CLOCK_MONOTONIC); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); - -/** - * schedule_hrtimeout - sleep until timeout - * @expires: timeout value (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. 
- * - * Returns 0 when the timer has expired otherwise -EINTR - */ -int __sched schedule_hrtimeout(ktime_t *expires, - const enum hrtimer_mode mode) -{ - return schedule_hrtimeout_range(expires, 0, mode); -} -EXPORT_SYMBOL_GPL(schedule_hrtimeout); -/* - * Detect Hung Task - * - * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; - -/* - * Limit number of tasks checked in a batch. - * - * This value controls the preemptibility of khungtaskd since preemption - * is disabled during the critical section. It also controls the size of - * the RCU grace period. So it needs to be upper-bound. - */ -#define HUNG_TASK_BATCHING 1024 - -/* - * Zero means infinite timeout - no checking done: - */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -static int __read_mostly did_panic; - -static struct task_struct *watchdog_task; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * hung task is detected: - */ -unsigned int __read_mostly sysctl_hung_task_panic = - CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; - -static int __init hung_task_panic_setup(char *str) -{ - sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - -static int -hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = hung_task_panic, -}; - -static void check_hung_task(struct task_struct *t, unsigned long timeout) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - /* - * Ensure the task is not frozen. 
- * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. - */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - return; - } - if (!sysctl_hung_task_warnings) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, timeout); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - debug_show_held_locks(t); - - touch_nmi_watchdog(); - - if (sysctl_hung_task_panic) - panic("hung_task: blocked tasks"); -} - -/* - * To avoid extending the RCU grace period for an unbounded amount of time, - * periodically exit the critical section and enter a new one. - * - * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * to exit the grace period. For classic RCU, a reschedule is required. - */ -static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) -{ - bool can_cont; - - get_task_struct(g); - get_task_struct(t); - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - can_cont = pid_alive(g) && pid_alive(t); - put_task_struct(t); - put_task_struct(g); - - return can_cont; -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. 
- */ -static void check_hung_uninterruptible_tasks(unsigned long timeout) -{ - int max_count = sysctl_hung_task_check_count; - int batch_count = HUNG_TASK_BATCHING; - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if (test_taint(TAINT_DIE) || did_panic) - return; - - rcu_read_lock(); - do_each_thread(g, t) { - if (!max_count--) - goto unlock; - if (!--batch_count) { - batch_count = HUNG_TASK_BATCHING; - if (!rcu_lock_break(g, t)) - goto unlock; - } - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, timeout); - } while_each_thread(g, t); - unlock: - rcu_read_unlock(); -} - -static unsigned long timeout_jiffies(unsigned long timeout) -{ - /* timeout of 0 will disable the watchdog */ - return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; -} - -/* - * Process updating of timeout sysctl - */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write) - goto out; - - wake_up_process(watchdog_task); - - out: - return ret; -} - -/* - * kthread which checks for tasks stuck in D state - */ -static int watchdog(void *dummy) -{ - set_user_nice(current, 0); - - for ( ; ; ) { - unsigned long timeout = sysctl_hung_task_timeout_secs; - - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - check_hung_uninterruptible_tasks(timeout); - } - - return 0; -} - -static int __init hung_task_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); - - return 0; -} - -module_init(hung_task_init); -/* - * linux/kernel/irq/autoprobe.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file 
contains the interrupt probing code and driver APIs. - */ - -#include -#include -#include -#include -#include - -#include "internals.h" - -/* - * Autodetection depends on the fact that any interrupt that - * comes in on to an unassigned handler will get stuck with - * "IRQS_WAITING" cleared and the interrupt disabled. - */ -static DEFINE_MUTEX(probing_active); - -/** - * probe_irq_on - begin an interrupt autodetect - * - * Commence probing for an interrupt. The interrupts are scanned - * and a mask of potential interrupt lines is returned. - * - */ -unsigned long probe_irq_on(void) -{ - struct irq_desc *desc; - unsigned long mask = 0; - int i; - - /* - * quiesce the kernel, or at least the asynchronous portion - */ - async_synchronize_full(); - mutex_lock(&probing_active); - /* - * something may have generated an irq long ago and we want to - * flush such a longstanding irq before considering it as spurious. - */ - for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); - if (!desc->action && irq_settings_can_probe(desc)) { - /* - * Some chips need to know about probing in - * progress: - */ - if (desc->irq_data.chip->irq_set_type) - desc->irq_data.chip->irq_set_type(&desc->irq_data, - IRQ_TYPE_PROBE); - irq_startup(desc, false); - } - raw_spin_unlock_irq(&desc->lock); - } - - /* Wait for longstanding interrupts to trigger. 
*/ - msleep(20); - - /* - * enable any unassigned irqs - * (we must startup again here because if a longstanding irq - * happened in the previous stage, it may have masked itself) - */ - for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); - if (!desc->action && irq_settings_can_probe(desc)) { - desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, false)) - desc->istate |= IRQS_PENDING; - } - raw_spin_unlock_irq(&desc->lock); - } - - /* - * Wait for spurious interrupts to trigger - */ - msleep(100); - - /* - * Now filter out any obviously spurious interrupts - */ - for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - - if (desc->istate & IRQS_AUTODETECT) { - /* It triggered already - consider it spurious. */ - if (!(desc->istate & IRQS_WAITING)) { - desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); - } else - if (i < 32) - mask |= 1 << i; - } - raw_spin_unlock_irq(&desc->lock); - } - - return mask; -} -EXPORT_SYMBOL(probe_irq_on); - -/** - * probe_irq_mask - scan a bitmap of interrupt lines - * @val: mask of interrupts to consider - * - * Scan the interrupt lines and return a bitmap of active - * autodetect interrupts. The interrupt probe logic state - * is then returned to its previous value. - * - * Note: we need to scan all the irq's even though we will - * only return autodetect irq numbers - just so that we reset - * them all to a known state. 
- */ -unsigned int probe_irq_mask(unsigned long val) -{ - unsigned int mask = 0; - struct irq_desc *desc; - int i; - - for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - if (desc->istate & IRQS_AUTODETECT) { - if (i < 16 && !(desc->istate & IRQS_WAITING)) - mask |= 1 << i; - - desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); - } - raw_spin_unlock_irq(&desc->lock); - } - mutex_unlock(&probing_active); - - return mask & val; -} -EXPORT_SYMBOL(probe_irq_mask); - -/** - * probe_irq_off - end an interrupt autodetect - * @val: mask of potential interrupts (unused) - * - * Scans the unused interrupt lines and returns the line which - * appears to have triggered the interrupt. If no interrupt was - * found then zero is returned. If more than one interrupt is - * found then minus the first candidate is returned to indicate - * their is doubt. - * - * The interrupt probe logic state is returned to its previous - * value. - * - * BUGS: When used in a module (which arguably shouldn't happen) - * nothing prevents two IRQ probe callers from overlapping. The - * results of this are non-optimal. - */ -int probe_irq_off(unsigned long val) -{ - int i, irq_found = 0, nr_of_irqs = 0; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - - if (desc->istate & IRQS_AUTODETECT) { - if (!(desc->istate & IRQS_WAITING)) { - if (!nr_of_irqs) - irq_found = i; - nr_of_irqs++; - } - desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); - } - raw_spin_unlock_irq(&desc->lock); - } - mutex_unlock(&probing_active); - - if (nr_of_irqs > 1) - irq_found = -irq_found; - - return irq_found; -} -EXPORT_SYMBOL(probe_irq_off); - -/* - * linux/kernel/irq/chip.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code, for irq-chip - * based architectures. 
- * - * Detailed information is available in Documentation/DocBook/genericirq - */ - -#include -#include -#include -#include -#include - -#include "internals.h" - -/** - * irq_set_chip - set the irq chip for an irq - * @irq: irq number - * @chip: pointer to irq chip description structure - */ -int irq_set_chip(unsigned int irq, struct irq_chip *chip) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - - if (!chip) - chip = &no_irq_chip; - - desc->irq_data.chip = chip; - irq_put_desc_unlock(desc, flags); - /* - * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is - * already marked, and this call is harmless. - */ - irq_reserve_irq(irq); - return 0; -} -EXPORT_SYMBOL(irq_set_chip); - -/** - * irq_set_type - set the irq trigger type for an irq - * @irq: irq number - * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h - */ -int irq_set_irq_type(unsigned int irq, unsigned int type) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; - - if (!desc) - return -EINVAL; - - type &= IRQ_TYPE_SENSE_MASK; - if (type != IRQ_TYPE_NONE) - ret = __irq_set_trigger(desc, irq, type); - irq_put_desc_busunlock(desc, flags); - return ret; -} -EXPORT_SYMBOL(irq_set_irq_type); - -/** - * irq_set_handler_data - set irq handler data for an irq - * @irq: Interrupt number - * @data: Pointer to interrupt specific data - * - * Set the hardware irq controller data for an irq - */ -int irq_set_handler_data(unsigned int irq, void *data) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_data.handler_data = data; - irq_put_desc_unlock(desc, flags); - return 0; -} -EXPORT_SYMBOL(irq_set_handler_data); - -/** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: 
Pointer to MSI descriptor data - * - * Set the MSI descriptor entry for an irq - */ -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - desc->irq_data.msi_desc = entry; - if (entry) - entry->irq = irq; - irq_put_desc_unlock(desc, flags); - return 0; -} - -/** - * irq_set_chip_data - set irq chip data for an irq - * @irq: Interrupt number - * @data: Pointer to chip specific data - * - * Set the hardware irq chip data for an irq - */ -int irq_set_chip_data(unsigned int irq, void *data) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_data.chip_data = data; - irq_put_desc_unlock(desc, flags); - return 0; -} -EXPORT_SYMBOL(irq_set_chip_data); - -struct irq_data *irq_get_irq_data(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc ? 
&desc->irq_data : NULL; -} -EXPORT_SYMBOL_GPL(irq_get_irq_data); - -static void irq_state_clr_disabled(struct irq_desc *desc) -{ - irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); -} - -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - -static void irq_state_clr_masked(struct irq_desc *desc) -{ - irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); -} - -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - -int irq_startup(struct irq_desc *desc, bool resend) -{ - int ret = 0; - - irq_state_clr_disabled(desc); - desc->depth = 0; - - if (desc->irq_data.chip->irq_startup) { - ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - irq_state_clr_masked(desc); - } else { - irq_enable(desc); - } - if (resend) - check_irq_resend(desc, desc->irq_data.irq); - return ret; -} - -void irq_shutdown(struct irq_desc *desc) -{ - irq_state_set_disabled(desc); - desc->depth = 1; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else if (desc->irq_data.chip->irq_disable) - desc->irq_data.chip->irq_disable(&desc->irq_data); - else - desc->irq_data.chip->irq_mask(&desc->irq_data); - irq_state_set_masked(desc); -} - -void irq_enable(struct irq_desc *desc) -{ - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) - desc->irq_data.chip->irq_enable(&desc->irq_data); - else - desc->irq_data.chip->irq_unmask(&desc->irq_data); - irq_state_clr_masked(desc); -} - -void irq_disable(struct irq_desc *desc) -{ - irq_state_set_disabled(desc); - if (desc->irq_data.chip->irq_disable) { - desc->irq_data.chip->irq_disable(&desc->irq_data); - irq_state_set_masked(desc); - } -} - -void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) -{ - if (desc->irq_data.chip->irq_enable) - desc->irq_data.chip->irq_enable(&desc->irq_data); - else - desc->irq_data.chip->irq_unmask(&desc->irq_data); - cpumask_set_cpu(cpu, 
desc->percpu_enabled); -} - -void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) -{ - if (desc->irq_data.chip->irq_disable) - desc->irq_data.chip->irq_disable(&desc->irq_data); - else - desc->irq_data.chip->irq_mask(&desc->irq_data); - cpumask_clear_cpu(cpu, desc->percpu_enabled); -} - -static inline void mask_ack_irq(struct irq_desc *desc) -{ - if (desc->irq_data.chip->irq_mask_ack) - desc->irq_data.chip->irq_mask_ack(&desc->irq_data); - else { - desc->irq_data.chip->irq_mask(&desc->irq_data); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); - } - irq_state_set_masked(desc); -} - -void mask_irq(struct irq_desc *desc) -{ - if (desc->irq_data.chip->irq_mask) { - desc->irq_data.chip->irq_mask(&desc->irq_data); - irq_state_set_masked(desc); - } -} - -void unmask_irq(struct irq_desc *desc) -{ - if (desc->irq_data.chip->irq_unmask) { - desc->irq_data.chip->irq_unmask(&desc->irq_data); - irq_state_clr_masked(desc); - } -} - -/* - * handle_nested_irq - Handle a nested irq from a irq thread - * @irq: the interrupt number - * - * Handle interrupts which are nested into a threaded interrupt - * handler. The handler function is called inside the calling - * threads context. 
- */ -void handle_nested_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - irqreturn_t action_ret; - - might_sleep(); - - raw_spin_lock_irq(&desc->lock); - - kstat_incr_irqs_this_cpu(irq, desc); - - action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) - goto out_unlock; - - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); - raw_spin_unlock_irq(&desc->lock); - - action_ret = action->thread_fn(action->irq, action->dev_id); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock_irq(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - -out_unlock: - raw_spin_unlock_irq(&desc->lock); -} -EXPORT_SYMBOL_GPL(handle_nested_irq); - -static bool irq_check_poll(struct irq_desc *desc) -{ - if (!(desc->istate & IRQS_POLL_INPROGRESS)) - return false; - return irq_wait_for_poll(desc); -} - -/** - * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Simple interrupts are either sent from a demultiplexing interrupt - * handler or come from hardware, where no interrupt hardware control - * is necessary. - * - * Note: The caller is expected to handle the ack, clear, mask and - * unmask issues if necessary. 
- */ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) -{ - raw_spin_lock(&desc->lock); - - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); - - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) - goto out_unlock; - - handle_irq_event(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); -} -EXPORT_SYMBOL_GPL(handle_simple_irq); - -/* - * Called unconditionally from handle_level_irq() and only for oneshot - * interrupts from handle_fasteoi_irq() - */ -static void cond_unmask_irq(struct irq_desc *desc) -{ - /* - * We need to unmask in the following cases: - * - Standard level irq (IRQF_ONESHOT is not set) - * - Oneshot irq which did not wake the thread (caused by a - * spurious interrupt or a primary handler handling it - * completely). - */ - if (!irqd_irq_disabled(&desc->irq_data) && - irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) - unmask_irq(desc); -} - -/** - * handle_level_irq - Level type irq handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Level type interrupts are active as long as the hardware line has - * the active level. This may require to mask the interrupt and unmask - * it after the associated handler has acknowledged the device, so the - * interrupt line is back to inactive. 
- */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) -{ - raw_spin_lock(&desc->lock); - mask_ack_irq(desc); - - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); - - /* - * If its disabled or no action available - * keep it masked and get out of here - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) - goto out_unlock; - - handle_irq_event(desc); - - cond_unmask_irq(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); -} -EXPORT_SYMBOL_GPL(handle_level_irq); - -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI -static inline void preflow_handler(struct irq_desc *desc) -{ - if (desc->preflow_handler) - desc->preflow_handler(&desc->irq_data); -} -#else -static inline void preflow_handler(struct irq_desc *desc) { } -#endif - -/** - * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Only a single callback will be issued to the chip: an ->eoi() - * call when the interrupt has been serviced. This enables support - * for modern forms of interrupt handlers, which handle the flow - * details in hardware, transparently. 
- */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) -{ - raw_spin_lock(&desc->lock); - - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - mask_irq(desc); - goto out; - } - - if (desc->istate & IRQS_ONESHOT) - mask_irq(desc); - - preflow_handler(desc); - handle_irq_event(desc); - - if (desc->istate & IRQS_ONESHOT) - cond_unmask_irq(desc); - -out_eoi: - desc->irq_data.chip->irq_eoi(&desc->irq_data); -out_unlock: - raw_spin_unlock(&desc->lock); - return; -out: - if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) - goto out_eoi; - goto out_unlock; -} - -/** - * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurrence is latched into the irq controller hardware - * and must be acked in order to be reenabled. After the ack another - * interrupt can happen on the same source even before the first one - * is handled by the associated event handler. If this happens it - * might be necessary to disable (mask) the interrupt depending on the - * controller hardware. This requires to reenable the interrupt inside - * of the loop which handles the interrupts which have arrived while - * the handler was running. If all pending interrupts are handled, the - * loop is left. - */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) -{ - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. 
Mark it pending, handle - * the necessary masking and go out - */ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; - } - } - kstat_incr_irqs_this_cpu(irq, desc); - - /* Start handling the irq */ - desc->irq_data.chip->irq_ack(&desc->irq_data); - - do { - if (unlikely(!desc->action)) { - mask_irq(desc); - goto out_unlock; - } - - /* - * When another irq arrived while we were handling - * one, we could have masked the irq. - * Renable it, if it was not disabled in meantime. - */ - if (unlikely(desc->istate & IRQS_PENDING)) { - if (!irqd_irq_disabled(&desc->irq_data) && - irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); - } - - handle_irq_event(desc); - - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_unlock: - raw_spin_unlock(&desc->lock); -} - -#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER -/** - * handle_edge_eoi_irq - edge eoi type IRQ handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Similar as the above handle_edge_irq, but using eoi and w/o the - * mask/unmask logic. - */ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. 
Mark it pending, handle - * the necessary masking and go out - */ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - } - kstat_incr_irqs_this_cpu(irq, desc); - - do { - if (unlikely(!desc->action)) - goto out_eoi; - - handle_irq_event(desc); - - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_eoi: - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); -} -#endif - -/** - * handle_percpu_irq - Per CPU local irq handler - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Per CPU interrupts on SMP machines without locking requirements - */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - - kstat_incr_irqs_this_cpu(irq, desc); - - if (chip->irq_ack) - chip->irq_ack(&desc->irq_data); - - handle_irq_event_percpu(desc, desc->action); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - -/** - * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq: the interrupt number - * @desc: the interrupt description structure for this irq - * - * Per CPU interrupts on SMP machines without locking requirements. 
Same as - * handle_percpu_irq() above but with the following extras: - * - * action->percpu_dev_id is a pointer to percpu variables which - * contain the real device id for the cpu on which this handler is - * called - */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - void *dev_id = __this_cpu_ptr(action->percpu_dev_id); - irqreturn_t res; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (chip->irq_ack) - chip->irq_ack(&desc->irq_data); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, dev_id); - trace_irq_handler_exit(irq, action, res); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - -void -__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - - if (!handle) { - handle = handle_bad_irq; - } else { - if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) - goto out; - } - - /* Uninstall? 
*/ - if (handle == handle_bad_irq) { - if (desc->irq_data.chip != &no_irq_chip) - mask_ack_irq(desc); - irq_state_set_disabled(desc); - desc->depth = 1; - } - desc->handle_irq = handle; - desc->name = name; - - if (handle != handle_bad_irq && is_chained) { - irq_settings_set_noprobe(desc); - irq_settings_set_norequest(desc); - irq_settings_set_nothread(desc); - irq_startup(desc, true); - } -out: - irq_put_desc_busunlock(desc, flags); -} -EXPORT_SYMBOL_GPL(__irq_set_handler); - -void -irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle, const char *name) -{ - irq_set_chip(irq, chip); - __irq_set_handler(irq, handle, 0, name); -} - -void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return; - irq_settings_clr_and_set(desc, clr, set); - - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); - if (irq_settings_has_no_balance_set(desc)) - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - if (irq_settings_is_per_cpu(desc)) - irqd_set(&desc->irq_data, IRQD_PER_CPU); - if (irq_settings_can_move_pcntxt(desc)) - irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); - if (irq_settings_is_level(desc)) - irqd_set(&desc->irq_data, IRQD_LEVEL); - - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); - - irq_put_desc_unlock(desc, flags); -} -EXPORT_SYMBOL_GPL(irq_modify_status); - -/** - * irq_cpu_online - Invoke all irq_cpu_online functions. - * - * Iterate through all irqs and invoke the chip.irq_cpu_online() - * for each. 
- */ -void irq_cpu_online(void) -{ - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; - unsigned int irq; - - for_each_active_irq(irq) { - desc = irq_to_desc(irq); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - - chip = irq_data_get_irq_chip(&desc->irq_data); - if (chip && chip->irq_cpu_online && - (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !irqd_irq_disabled(&desc->irq_data))) - chip->irq_cpu_online(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); - } -} - -/** - * irq_cpu_offline - Invoke all irq_cpu_offline functions. - * - * Iterate through all irqs and invoke the chip.irq_cpu_offline() - * for each. - */ -void irq_cpu_offline(void) -{ - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; - unsigned int irq; - - for_each_active_irq(irq) { - desc = irq_to_desc(irq); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - - chip = irq_data_get_irq_chip(&desc->irq_data); - if (chip && chip->irq_cpu_offline && - (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !irqd_irq_disabled(&desc->irq_data))) - chip->irq_cpu_offline(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); - } -} -#include -#include -#include -#include - -/* - * Device resource management aware IRQ request/free implementation. 
- */ -struct irq_devres { - unsigned int irq; - void *dev_id; -}; - -static void devm_irq_release(struct device *dev, void *res) -{ - struct irq_devres *this = res; - - free_irq(this->irq, this->dev_id); -} - -static int devm_irq_match(struct device *dev, void *res, void *data) -{ - struct irq_devres *this = res, *match = data; - - return this->irq == match->irq && this->dev_id == match->dev_id; -} - -/** - * devm_request_threaded_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. NULL - * for devices which handle everything in @handler - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_irq(). IRQs requested with this function will be - * automatically freed on driver detach. - * - * If an IRQ allocated with this function needs to be freed - * separately, devm_free_irq() must be used. 
- */ -int devm_request_threaded_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, irq_handler_t thread_fn, - unsigned long irqflags, const char *devname, - void *dev_id) -{ - struct irq_devres *dr; - int rc; - - dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), - GFP_KERNEL); - if (!dr) - return -ENOMEM; - - rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, - dev_id); - if (rc) { - devres_free(dr); - return rc; - } - - dr->irq = irq; - dr->dev_id = dev_id; - devres_add(dev, dr); - - return 0; -} -EXPORT_SYMBOL(devm_request_threaded_irq); - -/** - * devm_free_irq - free an interrupt - * @dev: device to free interrupt for - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as free_irq(). - * This function instead of free_irq() should be used to manually - * free IRQs allocated with devm_request_irq(). - */ -void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) -{ - struct irq_devres match_data = { irq, dev_id }; - - WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, - &match_data)); - free_irq(irq, dev_id); -} -EXPORT_SYMBOL(devm_free_irq); -/* - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the dummy interrupt chip implementation - */ -#include -#include - -#include "internals.h" - -/* - * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. 
- */ -static void ack_bad(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - print_irq_desc(data->irq, desc); - ack_bad_irq(data->irq); -} - -/* - * NOP functions - */ -static void noop(struct irq_data *data) { } - -static unsigned int noop_ret(struct irq_data *data) -{ - return 0; -} - -/* - * Generic no controller implementation - */ -struct irq_chip no_irq_chip = { - .name = "none", - .irq_startup = noop_ret, - .irq_shutdown = noop, - .irq_enable = noop, - .irq_disable = noop, - .irq_ack = ack_bad, -}; - -/* - * Generic dummy implementation which can be used for - * real dumb interrupt sources - */ -struct irq_chip dummy_irq_chip = { - .name = "dummy", - .irq_startup = noop_ret, - .irq_shutdown = noop, - .irq_enable = noop, - .irq_disable = noop, - .irq_ack = noop, - .irq_mask = noop, - .irq_unmask = noop, -}; -/* - * Library implementing the most common irq chip callback functions - * - * Copyright (C) 2011, Thomas Gleixner - */ -#include -#include -#include -#include -#include -#include -#include - -#include "internals.h" - -static LIST_HEAD(gc_list); -static DEFINE_RAW_SPINLOCK(gc_lock); - -static inline struct irq_chip_regs *cur_regs(struct irq_data *d) -{ - return &container_of(d->chip, struct irq_chip_type, chip)->regs; -} - -/** - * irq_gc_noop - NOOP function - * @d: irq_data - */ -void irq_gc_noop(struct irq_data *d) -{ -} - -/** - * irq_gc_mask_disable_reg - Mask chip via disable register - * @d: irq_data - * - * Chip has separate enable/disable registers instead of a single mask - * register. - */ -void irq_gc_mask_disable_reg(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); - gc->mask_cache &= ~mask; - irq_gc_unlock(gc); -} - -/** - * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register - * @d: irq_data - * - * Chip has a single mask register. 
Values of this register are cached - * and protected by gc->lock - */ -void irq_gc_mask_set_bit(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); - irq_gc_unlock(gc); -} - -/** - * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register - * @d: irq_data - * - * Chip has a single mask register. Values of this register are cached - * and protected by gc->lock - */ -void irq_gc_mask_clr_bit(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); - irq_gc_unlock(gc); -} - -/** - * irq_gc_unmask_enable_reg - Unmask chip via enable register - * @d: irq_data - * - * Chip has separate enable/disable registers instead of a single mask - * register. 
- */ -void irq_gc_unmask_enable_reg(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); - gc->mask_cache |= mask; - irq_gc_unlock(gc); -} - -/** - * irq_gc_ack_set_bit - Ack pending interrupt via setting bit - * @d: irq_data - */ -void irq_gc_ack_set_bit(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); - irq_gc_unlock(gc); -} - -/** - * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit - * @d: irq_data - */ -void irq_gc_ack_clr_bit(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = ~(1 << (d->irq - gc->irq_base)); - - irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); - irq_gc_unlock(gc); -} - -/** - * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt - * @d: irq_data - */ -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); - irq_gc_unlock(gc); -} - -/** - * irq_gc_eoi - EOI interrupt - * @d: irq_data - */ -void irq_gc_eoi(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); - irq_gc_unlock(gc); -} - -/** - * irq_gc_set_wake - Set/clr wake bit for an interrupt - * @d: irq_data - * - * For chips where the wake from suspend functionality is not - * configured in a separate register and the wakeup active state is - * just stored in a bitmask. 
- */ -int irq_gc_set_wake(struct irq_data *d, unsigned int on) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); - - if (!(mask & gc->wake_enabled)) - return -EINVAL; - - irq_gc_lock(gc); - if (on) - gc->wake_active |= mask; - else - gc->wake_active &= ~mask; - irq_gc_unlock(gc); - return 0; -} - -/** - * irq_alloc_generic_chip - Allocate a generic chip and initialize it - * @name: Name of the irq chip - * @num_ct: Number of irq_chip_type instances associated with this - * @irq_base: Interrupt base nr for this chip - * @reg_base: Register base address (virtual) - * @handler: Default flow handler associated with this chip - * - * Returns an initialized irq_chip_generic structure. The chip defaults - * to the primary (index 0) irq_chip_type and @handler - */ -struct irq_chip_generic * -irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, - void __iomem *reg_base, irq_flow_handler_t handler) -{ - struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - - gc = kzalloc(sz, GFP_KERNEL); - if (gc) { - raw_spin_lock_init(&gc->lock); - gc->num_ct = num_ct; - gc->irq_base = irq_base; - gc->reg_base = reg_base; - gc->chip_types->chip.name = name; - gc->chip_types->handler = handler; - } - return gc; -} -EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); - -/* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. - */ -static struct lock_class_key irq_nested_lock_class; - -/** - * irq_setup_generic_chip - Setup a range of interrupts with a generic chip - * @gc: Generic irq chip holding all data - * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base - * @flags: Flags for initialization - * @clr: IRQ_* bits to clear - * @set: IRQ_* bits to set - * - * Set up max. 32 interrupts starting from gc->irq_base. Note, this - * initializes all interrupts to the primary irq_chip_type and its - * associated handler. 
- */ -void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, - enum irq_gc_flags flags, unsigned int clr, - unsigned int set) -{ - struct irq_chip_type *ct = gc->chip_types; - unsigned int i; - - raw_spin_lock(&gc_lock); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock(&gc_lock); - - /* Init mask cache ? */ - if (flags & IRQ_GC_INIT_MASK_CACHE) - gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); - - for (i = gc->irq_base; msk; msk >>= 1, i++) { - if (!(msk & 0x01)) - continue; - - if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); - - irq_set_chip_and_handler(i, &ct->chip, ct->handler); - irq_set_chip_data(i, gc); - irq_modify_status(i, clr, set); - } - gc->irq_cnt = i - gc->irq_base; -} -EXPORT_SYMBOL_GPL(irq_setup_generic_chip); - -/** - * irq_setup_alt_chip - Switch to alternative chip - * @d: irq_data for this interrupt - * @type Flow type to be initialized - * - * Only to be called from chip->irq_set_type() callbacks. - */ -int irq_setup_alt_chip(struct irq_data *d, unsigned int type) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - struct irq_chip_type *ct = gc->chip_types; - unsigned int i; - - for (i = 0; i < gc->num_ct; i++, ct++) { - if (ct->type & type) { - d->chip = &ct->chip; - irq_data_to_desc(d)->handle_irq = ct->handler; - return 0; - } - } - return -EINVAL; -} -EXPORT_SYMBOL_GPL(irq_setup_alt_chip); - -/** - * irq_remove_generic_chip - Remove a chip - * @gc: Generic irq chip holding all data - * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base - * @clr: IRQ_* bits to clear - * @set: IRQ_* bits to set - * - * Remove up to 32 interrupts starting from gc->irq_base. 
- */ -void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, - unsigned int clr, unsigned int set) -{ - unsigned int i = gc->irq_base; - - raw_spin_lock(&gc_lock); - list_del(&gc->list); - raw_spin_unlock(&gc_lock); - - for (; msk; msk >>= 1, i++) { - if (!(msk & 0x01)) - continue; - - /* Remove handler first. That will mask the irq line */ - irq_set_handler(i, NULL); - irq_set_chip(i, &no_irq_chip); - irq_set_chip_data(i, NULL); - irq_modify_status(i, clr, set); - } -} -EXPORT_SYMBOL_GPL(irq_remove_generic_chip); - -#ifdef CONFIG_PM -static int irq_gc_suspend(void) -{ - struct irq_chip_generic *gc; - - list_for_each_entry(gc, &gc_list, list) { - struct irq_chip_type *ct = gc->chip_types; - - if (ct->chip.irq_suspend) - ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); - } - return 0; -} - -static void irq_gc_resume(void) -{ - struct irq_chip_generic *gc; - - list_for_each_entry(gc, &gc_list, list) { - struct irq_chip_type *ct = gc->chip_types; - - if (ct->chip.irq_resume) - ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); - } -} -#else -#define irq_gc_suspend NULL -#define irq_gc_resume NULL -#endif - -static void irq_gc_shutdown(void) -{ - struct irq_chip_generic *gc; - - list_for_each_entry(gc, &gc_list, list) { - struct irq_chip_type *ct = gc->chip_types; - - if (ct->chip.irq_pm_shutdown) - ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); - } -} - -static struct syscore_ops irq_gc_syscore_ops = { - .suspend = irq_gc_suspend, - .resume = irq_gc_resume, - .shutdown = irq_gc_shutdown, -}; - -static int __init irq_gc_init_ops(void) -{ - register_syscore_ops(&irq_gc_syscore_ops); - return 0; -} -device_initcall(irq_gc_init_ops); -/* - * linux/kernel/irq/handle.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code. 
- * - * Detailed information is available in Documentation/DocBook/genericirq - * - */ - -#include -#include -#include -#include -#include - -#include - -#include "internals.h" - -/** - * handle_bad_irq - handle spurious and unhandled irqs - * @irq: the interrupt number - * @desc: description of the interrupt - * - * Handles spurious and unhandled IRQ's. It also prints a debugmessage. - */ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) -{ - print_irq_desc(irq, desc); - kstat_incr_irqs_this_cpu(irq, desc); - ack_bad_irq(irq); -} - -/* - * Special, empty irq handler: - */ -irqreturn_t no_action(int cpl, void *dev_id) -{ - return IRQ_NONE; -} - -static void warn_no_thread(unsigned int irq, struct irqaction *action) -{ - if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) - return; - - printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " - "but no thread function available.", irq, action->name); -} - -static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) -{ - /* - * Wake up the handler thread for this action. In case the - * thread crashed and was killed we just pretend that we - * handled the interrupt. The hardirq handler has disabled the - * device interrupt, so no irq storm is lurking. If the - * RUNTHREAD bit is already set, nothing to do. - */ - if (test_bit(IRQTF_DIED, &action->thread_flags) || - test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - return; - - /* - * It's safe to OR the mask lockless here. We have only two - * places which write to threads_oneshot: This code and the - * irq thread. - * - * This code is the hard irq context and can never run on two - * cpus in parallel. If it ever does we have more serious - * problems than this bitmask. - * - * The irq threads of this irq which clear their "running" bit - * in threads_oneshot are serialized via desc->lock against - * each other and they are serialized against this code by - * IRQS_INPROGRESS. 
- * - * Hard irq handler: - * - * spin_lock(desc->lock); - * desc->state |= IRQS_INPROGRESS; - * spin_unlock(desc->lock); - * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - * desc->threads_oneshot |= mask; - * spin_lock(desc->lock); - * desc->state &= ~IRQS_INPROGRESS; - * spin_unlock(desc->lock); - * - * irq thread: - * - * again: - * spin_lock(desc->lock); - * if (desc->state & IRQS_INPROGRESS) { - * spin_unlock(desc->lock); - * while(desc->state & IRQS_INPROGRESS) - * cpu_relax(); - * goto again; - * } - * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - * desc->threads_oneshot &= ~mask; - * spin_unlock(desc->lock); - * - * So either the thread waits for us to clear IRQS_INPROGRESS - * or we are waiting in the flow handler for desc->lock to be - * released before we reach this point. The thread also checks - * IRQTF_RUNTHREAD under desc->lock. If set it leaves - * threads_oneshot untouched and runs the thread another time. - */ - desc->threads_oneshot |= action->thread_mask; - wake_up_process(action->thread); -} - -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) -{ - irqreturn_t retval = IRQ_NONE; - unsigned int random = 0, irq = desc->irq_data.irq; - - do { - irqreturn_t res; - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, res); - - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", - irq, action->handler)) - local_irq_disable(); - - switch (res) { - case IRQ_WAKE_THREAD: - /* - * Catch drivers which return WAKE_THREAD but - * did not set up a thread function - */ - if (unlikely(!action->thread_fn)) { - warn_no_thread(irq, action); - break; - } - - irq_wake_thread(desc, action); - - /* Fall through to add to randomness */ - case IRQ_HANDLED: - random |= action->flags; - break; - - default: - break; - } - - retval |= res; - action = action->next; - } while (action); - - if (random & IRQF_SAMPLE_RANDOM) - 
add_interrupt_randomness(irq); - - if (!noirqdebug) - note_interrupt(irq, desc, retval); - return retval; -} - -irqreturn_t handle_irq_event(struct irq_desc *desc) -{ - struct irqaction *action = desc->action; - irqreturn_t ret; - - desc->istate &= ~IRQS_PENDING; - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); - raw_spin_unlock(&desc->lock); - - ret = handle_irq_event_percpu(desc, action); - - raw_spin_lock(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - return ret; -} -/* - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the interrupt descriptor management code - * - * Detailed information is available in Documentation/DocBook/genericirq - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include "internals.h" - -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - -#if defined(CONFIG_SMP) -static void __init init_irq_default_affinity(void) -{ - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); -} -#else -static void __init init_irq_default_affinity(void) -{ -} -#endif - -#ifdef CONFIG_SMP -static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) -{ - if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) - return -ENOMEM; - -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); - return -ENOMEM; - } -#endif - return 0; -} - -static void desc_smp_init(struct irq_desc *desc, int node) -{ - desc->irq_data.node = node; - cpumask_copy(desc->irq_data.affinity, irq_default_affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -} - -static inline int desc_node(struct irq_desc *desc) -{ - return desc->irq_data.node; -} - -#else -static inline int 
-alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } -static inline void desc_smp_init(struct irq_desc *desc, int node) { } -static inline int desc_node(struct irq_desc *desc) { return 0; } -#endif - -static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, - struct module *owner) -{ - int cpu; - - desc->irq_data.irq = irq; - desc->irq_data.chip = &no_irq_chip; - desc->irq_data.chip_data = NULL; - desc->irq_data.handler_data = NULL; - desc->irq_data.msi_desc = NULL; - irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - desc->handle_irq = handle_bad_irq; - desc->depth = 1; - desc->irq_count = 0; - desc->irqs_unhandled = 0; - desc->name = NULL; - desc->owner = owner; - for_each_possible_cpu(cpu) - *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; - desc_smp_init(desc, node); -} - -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); - -static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); - -#ifdef CONFIG_SPARSE_IRQ - -static RADIX_TREE(irq_desc_tree, GFP_KERNEL); - -static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) -{ - radix_tree_insert(&irq_desc_tree, irq, desc); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return radix_tree_lookup(&irq_desc_tree, irq); -} - -static void delete_irq_desc(unsigned int irq) -{ - radix_tree_delete(&irq_desc_tree, irq); -} - -#ifdef CONFIG_SMP -static void free_masks(struct irq_desc *desc) -{ -#ifdef CONFIG_GENERIC_PENDING_IRQ - free_cpumask_var(desc->pending_mask); -#endif - free_cpumask_var(desc->irq_data.affinity); -} -#else -static inline void free_masks(struct irq_desc *desc) { } -#endif - -static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) -{ - struct irq_desc *desc; - gfp_t gfp = GFP_KERNEL; - - desc = kzalloc_node(sizeof(*desc), gfp, node); - if (!desc) - return NULL; - /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = 
alloc_percpu(unsigned int); - if (!desc->kstat_irqs) - goto err_desc; - - if (alloc_masks(desc, gfp, node)) - goto err_kstat; - - raw_spin_lock_init(&desc->lock); - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - - desc_set_defaults(irq, desc, node, owner); - - return desc; - -err_kstat: - free_percpu(desc->kstat_irqs); -err_desc: - kfree(desc); - return NULL; -} - -static void free_desc(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - unregister_irq_proc(irq, desc); - - mutex_lock(&sparse_irq_lock); - delete_irq_desc(irq); - mutex_unlock(&sparse_irq_lock); - - free_masks(desc); - free_percpu(desc->kstat_irqs); - kfree(desc); -} - -static int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) -{ - struct irq_desc *desc; - int i; - - for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node, owner); - if (!desc) - goto err; - mutex_lock(&sparse_irq_lock); - irq_insert_desc(start + i, desc); - mutex_unlock(&sparse_irq_lock); - } - return start; - -err: - for (i--; i >= 0; i--) - free_desc(start + i); - - mutex_lock(&sparse_irq_lock); - bitmap_clear(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); - return -ENOMEM; -} - -static int irq_expand_nr_irqs(unsigned int nr) -{ - if (nr > IRQ_BITMAP_BITS) - return -ENOMEM; - nr_irqs = nr; - return 0; -} - -int __init early_irq_init(void) -{ - int i, initcnt, node = first_online_node; - struct irq_desc *desc; - - init_irq_default_affinity(); - - /* Let arch update nr_irqs and return the nr of preallocated irqs */ - initcnt = arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); - - if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) - nr_irqs = IRQ_BITMAP_BITS; - - if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) - initcnt = IRQ_BITMAP_BITS; - - if (initcnt > nr_irqs) - nr_irqs = initcnt; - - for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node, NULL); - set_bit(i, allocated_irqs); - irq_insert_desc(i, desc); - } - 
return arch_early_irq_init(); -} - -#else /* !CONFIG_SPARSE_IRQ */ - -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS-1] = { - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), - } -}; - -int __init early_irq_init(void) -{ - int count, i, node = first_online_node; - struct irq_desc *desc; - - init_irq_default_affinity(); - - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); - - desc = irq_desc; - count = ARRAY_SIZE(irq_desc); - - for (i = 0; i < count; i++) { - desc[i].kstat_irqs = alloc_percpu(unsigned int); - alloc_masks(&desc[i], GFP_KERNEL, node); - raw_spin_lock_init(&desc[i].lock); - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node, NULL); - } - return arch_early_irq_init(); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? irq_desc + irq : NULL; -} - -static void free_desc(unsigned int irq) -{ - dynamic_irq_cleanup(irq); -} - -static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) -{ - u32 i; - - for (i = 0; i < cnt; i++) { - struct irq_desc *desc = irq_to_desc(start + i); - - desc->owner = owner; - } - return start; -} - -static int irq_expand_nr_irqs(unsigned int nr) -{ - return -ENOMEM; -} - -#endif /* !CONFIG_SPARSE_IRQ */ - -/** - * generic_handle_irq - Invoke the handler for a particular irq - * @irq: The irq number to handle - * - */ -int generic_handle_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc) - return -EINVAL; - generic_handle_irq_desc(irq, desc); - return 0; -} -EXPORT_SYMBOL_GPL(generic_handle_irq); - -/* Dynamic interrupt handling */ - -/** - * irq_free_descs - free irq descriptors - * @from: Start of descriptor range - * @cnt: Number of consecutive irqs to free - */ -void irq_free_descs(unsigned int from, unsigned int cnt) -{ - int i; - - if (from >= nr_irqs || (from + cnt) > nr_irqs) - return; - - for (i = 0; 
i < cnt; i++) - free_desc(from + i); - - mutex_lock(&sparse_irq_lock); - bitmap_clear(allocated_irqs, from, cnt); - mutex_unlock(&sparse_irq_lock); -} -EXPORT_SYMBOL_GPL(irq_free_descs); - -/** - * irq_alloc_descs - allocate and initialize a range of irq descriptors - * @irq: Allocate for specific irq number if irq >= 0 - * @from: Start the search from this irq number - * @cnt: Number of consecutive irqs to allocate. - * @node: Preferred node on which the irq descriptor should be allocated - * @owner: Owning module (can be NULL) - * - * Returns the first irq number or error code - */ -int __ref -__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner) -{ - int start, ret; - - if (!cnt) - return -EINVAL; - - if (irq >= 0) { - if (from > irq) - return -EINVAL; - from = irq; - } - - mutex_lock(&sparse_irq_lock); - - start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, - from, cnt, 0); - ret = -EEXIST; - if (irq >=0 && start != irq) - goto err; - - if (start + cnt > nr_irqs) { - ret = irq_expand_nr_irqs(start + cnt); - if (ret) - goto err; - } - - bitmap_set(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, owner); - -err: - mutex_unlock(&sparse_irq_lock); - return ret; -} -EXPORT_SYMBOL_GPL(__irq_alloc_descs); - -/** - * irq_reserve_irqs - mark irqs allocated - * @from: mark from irq number - * @cnt: number of irqs to mark - * - * Returns 0 on success or an appropriate error code - */ -int irq_reserve_irqs(unsigned int from, unsigned int cnt) -{ - unsigned int start; - int ret = 0; - - if (!cnt || (from + cnt) > nr_irqs) - return -EINVAL; - - mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); - if (start == from) - bitmap_set(allocated_irqs, start, cnt); - else - ret = -EEXIST; - mutex_unlock(&sparse_irq_lock); - return ret; -} - -/** - * irq_get_next_irq - get next allocated irq number - * @offset: where 
to start the search - * - * Returns next irq number after offset or nr_irqs if none is found. - */ -unsigned int irq_get_next_irq(unsigned int offset) -{ - return find_next_bit(allocated_irqs, nr_irqs, offset); -} - -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc) { - if (check & _IRQ_DESC_CHECK) { - if ((check & _IRQ_DESC_PERCPU) && - !irq_settings_is_per_cpu_devid(desc)) - return NULL; - - if (!(check & _IRQ_DESC_PERCPU) && - irq_settings_is_per_cpu_devid(desc)) - return NULL; - } - - if (bus) - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, *flags); - } - return desc; -} - -void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) -{ - raw_spin_unlock_irqrestore(&desc->lock, flags); - if (bus) - chip_bus_sync_unlock(desc); -} - -int irq_set_percpu_devid(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc) - return -EINVAL; - - if (desc->percpu_enabled) - return -EINVAL; - - desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); - - if (!desc->percpu_enabled) - return -ENOMEM; - - irq_set_percpu_devid_flags(irq); - return 0; -} - -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return desc && desc->kstat_irqs ? 
- *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; -} - -unsigned int kstat_irqs(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - int cpu; - int sum = 0; - - if (!desc || !desc->kstat_irqs) - return 0; - for_each_possible_cpu(cpu) - sum += *per_cpu_ptr(desc->kstat_irqs, cpu); - return sum; -} -#include -#include -#include -#include -#include -#include -#include - -static LIST_HEAD(irq_domain_list); -static DEFINE_MUTEX(irq_domain_mutex); - -/** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure - * - * Registers an irq_domain structure. The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value. Everything else is optional. - */ -void irq_domain_add(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - /* - * This assumes that the irq_domain owner has already allocated - * the irq_descs. This block will be removed when support for dynamic - * allocation of irq_descs is added to irq_domain. - */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - if (!d) { - WARN(1, "error: assigning domain to non existant irq_desc"); - return; - } - if (d->domain) { - /* things are broken; just report, don't clean up */ - WARN(1, "error: irq_desc already assigned to a domain"); - return; - } - d->domain = domain; - d->hwirq = hwirq; - } - - mutex_lock(&irq_domain_mutex); - list_add(&domain->list, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); -} - -/** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. 
- */ -void irq_domain_del(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - mutex_lock(&irq_domain_mutex); - list_del(&domain->list); - mutex_unlock(&irq_domain_mutex); - - /* Clear the irq_domain assignments */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - d->domain = NULL; - } -} - -#if defined(CONFIG_OF_IRQ) -/** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec - * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number. Returns either a valid - * linux IRQ number or 0. - * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). - */ -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *domain; - unsigned long hwirq; - unsigned int irq, type; - int rc = -EINVAL; - - /* Find a domain which can translate the irq spec */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, list) { - if (!domain->ops->dt_translate) - continue; - rc = domain->ops->dt_translate(domain, controller, - intspec, intsize, &hwirq, &type); - if (rc == 0) - break; - } - mutex_unlock(&irq_domain_mutex); - - if (rc != 0) - return 0; - - irq = irq_domain_to_irq(domain, hwirq); - if (type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, type); - pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", - controller->full_name, (int)hwirq, irq, type); - return irq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - -/** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded - * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). 
- */ -void irq_dispose_mapping(unsigned int irq) -{ - /* - * nothing yet; will be filled when support for dynamic allocation of - * irq_descs is added to irq_domain - */ -} -EXPORT_SYMBOL_GPL(irq_dispose_mapping); - -int irq_domain_simple_dt_translate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) -{ - if (d->of_node != controller) - return -EINVAL; - if (intsize < 1) - return -EINVAL; - if (d->nr_irq && ((intspec[0] < d->hwirq_base) || - (intspec[0] >= d->hwirq_base + d->nr_irq))) - return -EINVAL; - - *out_hwirq = intspec[0]; - *out_type = IRQ_TYPE_NONE; - if (intsize > 1) - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; - return 0; -} - -/** - * irq_domain_create_simple() - Set up a 'simple' translation range - */ -void irq_domain_add_simple(struct device_node *controller, int irq_base) -{ - struct irq_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) { - WARN_ON(1); - return; - } - - domain->irq_base = irq_base; - domain->of_node = of_node_get(controller); - domain->ops = &irq_domain_simple_ops; - irq_domain_add(domain); -} -EXPORT_SYMBOL_GPL(irq_domain_add_simple); - -void irq_domain_generate_simple(const struct of_device_id *match, - u64 phys_base, unsigned int irq_start) -{ - struct device_node *node; - pr_info("looking for phys_base=%llx, irq_start=%i\n", - (unsigned long long) phys_base, (int) irq_start); - node = of_find_matching_node_by_address(NULL, match, phys_base); - if (node) - irq_domain_add_simple(node, irq_start); - else - pr_info("no node found\n"); -} -EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ - .dt_translate = irq_domain_simple_dt_translate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -/* - * linux/kernel/irq/manage.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo 
Molnar - * Copyright (C) 2005-2006 Thomas Gleixner - * - * This file contains driver APIs to the irq subsystem. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "internals.h" - -#ifdef CONFIG_IRQ_FORCED_THREADING -__read_mostly bool force_irqthreads; - -static int __init setup_forced_irqthreads(char *arg) -{ - force_irqthreads = true; - return 0; -} -early_param("threadirqs", setup_forced_irqthreads); -#endif - -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - bool inprogress; - - if (!desc) - return; - - do { - unsigned long flags; - - /* - * Wait until we're out of the critical section. This might - * give the wrong answer due to the lack of memory barriers. - */ - while (irqd_irq_inprogress(&desc->irq_data)) - cpu_relax(); - - /* Ok, that indicated we're done: double-check carefully. */ - raw_spin_lock_irqsave(&desc->lock, flags); - inprogress = irqd_irq_inprogress(&desc->irq_data); - raw_spin_unlock_irqrestore(&desc->lock, flags); - - /* Oops, that failed? */ - } while (inprogress); - - /* - * We made sure that no hardirq handler is running. Now verify - * that no threaded handlers are active. 
- */ - wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); -} -EXPORT_SYMBOL(synchronize_irq); - -#ifdef CONFIG_SMP -cpumask_var_t irq_default_affinity; - -/** - * irq_can_set_affinity - Check if the affinity of a given irq can be set - * @irq: Interrupt to check - * - */ -int irq_can_set_affinity(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !irqd_can_balance(&desc->irq_data) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - - return 1; -} - -/** - * irq_set_thread_affinity - Notify irq threads to adjust affinity - * @desc: irq descriptor which has affitnity changed - * - * We just set IRQTF_AFFINITY and delegate the affinity setting - * to the interrupt thread itself. We can not call - * set_cpus_allowed_ptr() here as we hold desc->lock and this - * code can be called from hard interrupt context. - */ -void irq_set_thread_affinity(struct irq_desc *desc) -{ - struct irqaction *action = desc->action; - - while (action) { - if (action->thread) - set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } -} - -#ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_data *data) -{ - return irqd_can_move_in_process_context(data); -} -static inline bool irq_move_pending(struct irq_data *data) -{ - return irqd_is_setaffinity_pending(data); -} -static inline void -irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) -{ - cpumask_copy(desc->pending_mask, mask); -} -static inline void -irq_get_pending(struct cpumask *mask, struct irq_desc *desc) -{ - cpumask_copy(mask, desc->pending_mask); -} -#else -static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } -static inline bool irq_move_pending(struct irq_data *data) { return false; } -static inline void -irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } -static inline void -irq_get_pending(struct cpumask *mask, struct irq_desc 
*desc) { } -#endif - -int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) -{ - struct irq_chip *chip = irq_data_get_irq_chip(data); - struct irq_desc *desc = irq_data_to_desc(data); - int ret = 0; - - if (!chip || !chip->irq_set_affinity) - return -EINVAL; - - if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } - } else { - irqd_set_move_pending(data); - irq_copy_pending(desc, mask); - } - - if (desc->affinity_notify) { - kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); - } - irqd_set(data, IRQD_AFFINITY_SET); - - return ret; -} - -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @mask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - if (!desc) - return -EINVAL; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - desc->affinity_hint = m; - irq_put_desc_unlock(desc, flags); - return 0; -} -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); - -static void irq_affinity_notify(struct work_struct *work) -{ - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); - struct irq_desc *desc = irq_to_desc(notify->irq); - cpumask_var_t cpumask; - unsigned long flags; - - if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) - goto out; - - raw_spin_lock_irqsave(&desc->lock, 
flags); - if (irq_move_pending(&desc->irq_data)) - irq_get_pending(cpumask, desc); - else - cpumask_copy(cpumask, desc->irq_data.affinity); - raw_spin_unlock_irqrestore(&desc->lock, flags); - - notify->notify(notify, cpumask); - - free_cpumask_var(cpumask); -out: - kref_put(¬ify->kref, notify->release); -} - -/** - * irq_set_affinity_notifier - control notification of IRQ affinity changes - * @irq: Interrupt for which to enable/disable notification - * @notify: Context for notification, or %NULL to disable - * notification. Function pointers must be initialised; - * the other fields will be initialised by this function. - * - * Must be called in process context. Notification may only be enabled - * after the IRQ is allocated and must be disabled before the IRQ is - * freed using free_irq(). - */ -int -irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irq_affinity_notify *old_notify; - unsigned long flags; - - /* The release function is promised process context */ - might_sleep(); - - if (!desc) - return -EINVAL; - - /* Complete initialisation of *notify */ - if (notify) { - notify->irq = irq; - kref_init(¬ify->kref); - INIT_WORK(¬ify->work, irq_affinity_notify); - } - - raw_spin_lock_irqsave(&desc->lock, flags); - old_notify = desc->affinity_notify; - desc->affinity_notify = notify; - raw_spin_unlock_irqrestore(&desc->lock, flags); - - if (old_notify) - kref_put(&old_notify->kref, old_notify->release); - - return 0; -} -EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); - -#ifndef CONFIG_AUTO_IRQ_AFFINITY -/* - * Generic version of the affinity autoselector. 
- */ -static int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct cpumask *set = irq_default_affinity; - int ret; - - /* Excludes PER_CPU and NO_BALANCE interrupts */ - if (!irq_can_set_affinity(irq)) - return 0; - - /* - * Preserve an userspace affinity setup, but make sure that - * one of the targets is online. - */ - if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { - if (cpumask_intersects(desc->irq_data.affinity, - cpu_online_mask)) - set = desc->irq_data.affinity; - else - irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); - } - - cpumask_and(mask, cpu_online_mask, set); - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - return 0; -} -#else -static inline int -setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) -{ - return irq_select_affinity(irq); -} -#endif - -/* - * Called when affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc, mask); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -#else -static inline int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) -{ - return 0; -} -#endif - -void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) -{ - if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) - return; - desc->istate |= IRQS_SUSPENDED; - } - - if (!desc->depth++) - irq_disable(desc); -} - -static int __disable_irq_nosync(unsigned int irq) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - 
return -EINVAL; - __disable_irq(desc, irq, false); - irq_put_desc_busunlock(desc, flags); - return 0; -} - -/** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. - * - * This function may be called from IRQ context. - */ -void disable_irq_nosync(unsigned int irq) -{ - __disable_irq_nosync(irq); -} -EXPORT_SYMBOL(disable_irq_nosync); - -/** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void disable_irq(unsigned int irq) -{ - if (!__disable_irq_nosync(irq)) - synchronize_irq(irq); -} -EXPORT_SYMBOL(disable_irq); - -void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) -{ - if (resume) { - if (!(desc->istate & IRQS_SUSPENDED)) { - if (!desc->action) - return; - if (!(desc->action->flags & IRQF_FORCE_RESUME)) - return; - /* Pretend that it got disabled ! */ - desc->depth++; - } - desc->istate &= ~IRQS_SUSPENDED; - } - - switch (desc->depth) { - case 0: - err_out: - WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - break; - case 1: { - if (desc->istate & IRQS_SUSPENDED) - goto err_out; - /* Prevent probing on this irq: */ - irq_settings_set_noprobe(desc); - irq_enable(desc); - check_irq_resend(desc, irq); - /* fall-through */ - } - default: - desc->depth--; - } -} - -/** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable - * - * Undoes the effect of one call to disable_irq(). 
If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. - * - * This function may be called from IRQ context only when - * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! - */ -void enable_irq(unsigned int irq) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return; - if (WARN(!desc->irq_data.chip, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - goto out; - - __enable_irq(desc, irq, false); -out: - irq_put_desc_busunlock(desc, flags); -} -EXPORT_SYMBOL(enable_irq); - -static int set_irq_wake_real(unsigned int irq, unsigned int on) -{ - struct irq_desc *desc = irq_to_desc(irq); - int ret = -ENXIO; - - if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) - return 0; - - if (desc->irq_data.chip->irq_set_wake) - ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); - - return ret; -} - -/** - * irq_set_irq_wake - control irq power management wakeup - * @irq: interrupt to control - * @on: enable/disable power management wakeup - * - * Enable/disable power management wakeup mode, which is - * disabled by default. Enables and disables must match, - * just as they match for non-wakeup mode support. - * - * Wakeup mode lets this IRQ wake the system from sleep - * states like "suspend to RAM". - */ -int irq_set_irq_wake(unsigned int irq, unsigned int on) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; - - if (!desc) - return -EINVAL; - - /* wakeup-capable irqs can be shared between drivers that - * don't need to have the same sleep mode behaviors. 
- */ - if (on) { - if (desc->wake_depth++ == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 0; - else - irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); - } - } else { - if (desc->wake_depth == 0) { - WARN(1, "Unbalanced IRQ %d wake disable\n", irq); - } else if (--desc->wake_depth == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 1; - else - irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); - } - } - irq_put_desc_busunlock(desc, flags); - return ret; -} -EXPORT_SYMBOL(irq_set_irq_wake); - -/* - * Internal function that tells the architecture code whether a - * particular irq has been exclusively allocated or is available - * for driver use. - */ -int can_request_irq(unsigned int irq, unsigned long irqflags) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - int canrequest = 0; - - if (!desc) - return 0; - - if (irq_settings_can_request(desc)) { - if (desc->action) - if (irqflags & desc->action->flags & IRQF_SHARED) - canrequest =1; - } - irq_put_desc_unlock(desc, flags); - return canrequest; -} - -int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) -{ - struct irq_chip *chip = desc->irq_data.chip; - int ret, unmask = 0; - - if (!chip || !chip->irq_set_type) { - /* - * IRQF_TRIGGER_* but the PIC does not support multiple - * flow-types? - */ - pr_debug("No set_type function for IRQ %d (%s)\n", irq, - chip ? (chip->name ? 
: "unknown") : "unknown"); - return 0; - } - - flags &= IRQ_TYPE_SENSE_MASK; - - if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { - if (!irqd_irq_masked(&desc->irq_data)) - mask_irq(desc); - if (!irqd_irq_disabled(&desc->irq_data)) - unmask = 1; - } - - /* caller masked out all except trigger mode flags */ - ret = chip->irq_set_type(&desc->irq_data, flags); - - switch (ret) { - case IRQ_SET_MASK_OK: - irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); - irqd_set(&desc->irq_data, flags); - - case IRQ_SET_MASK_OK_NOCOPY: - flags = irqd_get_trigger_type(&desc->irq_data); - irq_settings_set_trigger_mask(desc, flags); - irqd_clear(&desc->irq_data, IRQD_LEVEL); - irq_settings_clr_level(desc); - if (flags & IRQ_TYPE_LEVEL_MASK) { - irq_settings_set_level(desc); - irqd_set(&desc->irq_data, IRQD_LEVEL); - } - - ret = 0; - break; - default: - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); - } - if (unmask) - unmask_irq(desc); - return ret; -} - -/* - * Default primary interrupt handler for threaded interrupts. Is - * assigned as primary handler when request_threaded_irq is called - * with handler == NULL. Useful for oneshot interrupts. - */ -static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) -{ - return IRQ_WAKE_THREAD; -} - -/* - * Primary handler for nested threaded interrupts. Should never be - * called. 
- */ -static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) -{ - WARN(1, "Primary handler called for nested irq %d\n", irq); - return IRQ_NONE; -} - -static int irq_wait_for_interrupt(struct irqaction *action) -{ - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - - if (test_and_clear_bit(IRQTF_RUNTHREAD, - &action->thread_flags)) { - __set_current_state(TASK_RUNNING); - return 0; - } - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return -1; -} - -/* - * Oneshot interrupts keep the irq line masked until the threaded - * handler finished. unmask if the interrupt has not been disabled and - * is marked MASKED. - */ -static void irq_finalize_oneshot(struct irq_desc *desc, - struct irqaction *action, bool force) -{ - if (!(desc->istate & IRQS_ONESHOT)) - return; -again: - chip_bus_lock(desc); - raw_spin_lock_irq(&desc->lock); - - /* - * Implausible though it may be we need to protect us against - * the following scenario: - * - * The thread is faster done than the hard interrupt handler - * on the other CPU. If we unmask the irq line then the - * interrupt can come in again and masks the line, leaves due - * to IRQS_INPROGRESS and the irq line is masked forever. - * - * This also serializes the state of shared oneshot handlers - * versus "desc->threads_onehsot |= action->thread_mask;" in - * irq_wake_thread(). See the comment there which explains the - * serialization. - */ - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { - raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(desc); - cpu_relax(); - goto again; - } - - /* - * Now check again, whether the thread should run. Otherwise - * we would clear the threads_oneshot bit of this thread which - * was just set. 
- */ - if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - goto out_unlock; - - desc->threads_oneshot &= ~action->thread_mask; - - if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && - irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); - -out_unlock: - raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(desc); -} - -#ifdef CONFIG_SMP -/* - * Check whether we need to chasnge the affinity of the interrupt thread. - */ -static void -irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) -{ - cpumask_var_t mask; - - if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) - return; - - /* - * In case we are out of memory we set IRQTF_AFFINITY again and - * try again next time - */ - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { - set_bit(IRQTF_AFFINITY, &action->thread_flags); - return; - } - - raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->irq_data.affinity); - raw_spin_unlock_irq(&desc->lock); - - set_cpus_allowed_ptr(current, mask); - free_cpumask_var(mask); -} -#else -static inline void -irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } -#endif - -/* - * Interrupts which are not explicitely requested as threaded - * interrupts rely on the implicit bh/preempt disable of the hard irq - * context. So we need to disable bh here to avoid deadlocks and other - * side effects. - */ -static irqreturn_t -irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) -{ - irqreturn_t ret; - - local_bh_disable(); - ret = action->thread_fn(action->irq, action->dev_id); - irq_finalize_oneshot(desc, action, false); - local_bh_enable(); - return ret; -} - -/* - * Interrupts explicitely requested as threaded interupts want to be - * preemtible - many of them need to sleep and wait for slow busses to - * complete. 
- */ -static irqreturn_t irq_thread_fn(struct irq_desc *desc, - struct irqaction *action) -{ - irqreturn_t ret; - - ret = action->thread_fn(action->irq, action->dev_id); - irq_finalize_oneshot(desc, action, false); - return ret; -} - -/* - * Interrupt handler thread - */ -static int irq_thread(void *data) -{ - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - struct irqaction *action = data; - struct irq_desc *desc = irq_to_desc(action->irq); - irqreturn_t (*handler_fn)(struct irq_desc *desc, - struct irqaction *action); - int wake; - - if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, - &action->thread_flags)) - handler_fn = irq_forced_thread_fn; - else - handler_fn = irq_thread_fn; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irqaction = action; - - while (!irq_wait_for_interrupt(action)) { - - irq_thread_check_affinity(desc, action); - - atomic_inc(&desc->threads_active); - - raw_spin_lock_irq(&desc->lock); - if (unlikely(irqd_irq_disabled(&desc->irq_data))) { - /* - * CHECKME: We might need a dedicated - * IRQ_THREAD_PENDING flag here, which - * retriggers the thread in check_irq_resend() - * but AFAICT IRQS_PENDING should be fine as it - * retriggers the interrupt itself --- tglx - */ - desc->istate |= IRQS_PENDING; - raw_spin_unlock_irq(&desc->lock); - } else { - irqreturn_t action_ret; - - raw_spin_unlock_irq(&desc->lock); - action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); - } - - wake = atomic_dec_and_test(&desc->threads_active); - - if (wake && waitqueue_active(&desc->wait_for_threads)) - wake_up(&desc->wait_for_threads); - } - - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action, true); - - /* - * Clear irqaction. Otherwise exit_irq_thread() would make - * fuzz about an active irq thread going into nirvana. 
- */ - current->irqaction = NULL; - return 0; -} - -/* - * Called from do_exit() - */ -void exit_irq_thread(void) -{ - struct task_struct *tsk = current; - struct irq_desc *desc; - - if (!tsk->irqaction) - return; - - printk(KERN_ERR - "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); - - desc = irq_to_desc(tsk->irqaction->irq); - - /* - * Prevent a stale desc->threads_oneshot. Must be called - * before setting the IRQTF_DIED flag. - */ - irq_finalize_oneshot(desc, tsk->irqaction, true); - - /* - * Set the THREAD DIED flag to prevent further wakeups of the - * soon to be gone threaded handler. - */ - set_bit(IRQTF_DIED, &tsk->irqaction->flags); -} - -static void irq_setup_forced_threading(struct irqaction *new) -{ - if (!force_irqthreads) - return; - if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) - return; - - new->flags |= IRQF_ONESHOT; - - if (!new->thread_fn) { - set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); - new->thread_fn = new->handler; - new->handler = irq_default_primary_handler; - } -} - -/* - * Internal function to register an irqaction - typically used to - * allocate special interrupts that are part of the architecture. - */ -static int -__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) -{ - struct irqaction *old, **old_ptr; - const char *old_name = NULL; - unsigned long flags, thread_mask = 0; - int ret, nested, shared = 0; - cpumask_var_t mask; - - if (!desc) - return -EINVAL; - - if (desc->irq_data.chip == &no_irq_chip) - return -ENOSYS; - if (!try_module_get(desc->owner)) - return -ENODEV; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. 
- * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } - - /* - * Check whether the interrupt nests into another interrupt - * thread. - */ - nested = irq_settings_is_nested_thread(desc); - if (nested) { - if (!new->thread_fn) { - ret = -EINVAL; - goto out_mput; - } - /* - * Replace the primary handler which was provided from - * the driver for non nested interrupt handling by the - * dummy function which warns when called. - */ - new->handler = irq_nested_primary_handler; - } else { - if (irq_settings_can_thread(desc)) - irq_setup_forced_threading(new); - } - - /* - * Create a handler thread when a thread function is supplied - * and the interrupt does not nest into another interrupt - * thread. - */ - if (new->thread_fn && !nested) { - struct task_struct *t; - - t = kthread_create(irq_thread, new, "irq/%d-%s", irq, - new->name); - if (IS_ERR(t)) { - ret = PTR_ERR(t); - goto out_mput; - } - /* - * We keep the reference to the task struct even if - * the thread dies to avoid that the interrupt code - * references an already freed task_struct. - */ - get_task_struct(t); - new->thread = t; - } - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out_thread; - } - - /* - * The following block of code has to be executed atomically - */ - raw_spin_lock_irqsave(&desc->lock, flags); - old_ptr = &desc->action; - old = *old_ptr; - if (old) { - /* - * Can't share interrupts unless both agree to and are - * the same type (level, edge, polarity). So both flag - * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. Also all must - * agree on ONESHOT. 
- */ - if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || - ((old->flags ^ new->flags) & IRQF_ONESHOT)) { - old_name = old->name; - goto mismatch; - } - - /* All handlers must agree on per-cpuness */ - if ((old->flags & IRQF_PERCPU) != - (new->flags & IRQF_PERCPU)) - goto mismatch; - - /* add new interrupt at end of irq queue */ - do { - /* - * Or all existing action->thread_mask bits, - * so we can find the next zero bit for this - * new action. - */ - thread_mask |= old->thread_mask; - old_ptr = &old->next; - old = *old_ptr; - } while (old); - shared = 1; - } - - /* - * Setup the thread mask for this irqaction for ONESHOT. For - * !ONESHOT irqs the thread mask is 0 so we can avoid a - * conditional in irq_wake_thread(). - */ - if (new->flags & IRQF_ONESHOT) { - /* - * Unlikely to have 32 resp 64 irqs sharing one line, - * but who knows. - */ - if (thread_mask == ~0UL) { - ret = -EBUSY; - goto out_mask; - } - /* - * The thread_mask for the action is or'ed to - * desc->thread_active to indicate that the - * IRQF_ONESHOT thread handler has been woken, but not - * yet finished. The bit is cleared when a thread - * completes. When all threads of a shared interrupt - * line have completed desc->threads_active becomes - * zero and the interrupt line is unmasked. See - * handle.c:irq_wake_thread() for further information. - * - * If no thread is woken by primary (hard irq context) - * interrupt handlers, then desc->threads_active is - * also checked for zero to unmask the irq line in the - * affected hard irq flow handlers - * (handle_[fasteoi|level]_irq). - * - * The new action gets the first zero bit of - * thread_mask assigned. See the loop above which or's - * all existing action->thread_mask bits. 
- */ - new->thread_mask = 1 << ffz(thread_mask); - } - - if (!shared) { - init_waitqueue_head(&desc->wait_for_threads); - - /* Setup the type (level, edge polarity) if configured: */ - if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, - new->flags & IRQF_TRIGGER_MASK); - - if (ret) - goto out_mask; - } - - desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_ONESHOT | IRQS_WAITING); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - - if (new->flags & IRQF_PERCPU) { - irqd_set(&desc->irq_data, IRQD_PER_CPU); - irq_settings_set_per_cpu(desc); - } - - if (new->flags & IRQF_ONESHOT) - desc->istate |= IRQS_ONESHOT; - - if (irq_settings_can_autoenable(desc)) - irq_startup(desc, true); - else - /* Undo nested disables: */ - desc->depth = 1; - - /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) { - irq_settings_set_no_balancing(desc); - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - } - - /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc, mask); - - } else if (new->flags & IRQF_TRIGGER_MASK) { - unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; - unsigned int omsk = irq_settings_get_trigger_mask(desc); - - if (nmsk != omsk) - /* hope the handler works with current trigger mode */ - pr_warning("IRQ %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); - } - - new->irq = irq; - *old_ptr = new; - - /* Reset broken irq detection when installing new handler */ - desc->irq_count = 0; - desc->irqs_unhandled = 0; - - /* - * Check whether we disabled the irq via the spurious handler - * before. Reenable it and give it another chance. - */ - if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { - desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq, false); - } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - /* - * Strictly no need to wake it up, but hung_task complains - * when no hard interrupt wakes the thread up. 
- */ - if (new->thread) - wake_up_process(new->thread); - - register_irq_proc(irq, desc); - new->dir = NULL; - register_handler_proc(irq, new); - free_cpumask_var(mask); - - return 0; - -mismatch: -#ifdef CONFIG_DEBUG_SHIRQ - if (!(new->flags & IRQF_PROBE_SHARED)) { - printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); - if (old_name) - printk(KERN_ERR "current handler: %s\n", old_name); - dump_stack(); - } -#endif - ret = -EBUSY; - -out_mask: - raw_spin_unlock_irqrestore(&desc->lock, flags); - free_cpumask_var(mask); - -out_thread: - if (new->thread) { - struct task_struct *t = new->thread; - - new->thread = NULL; - if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) - kthread_stop(t); - put_task_struct(t); - } -out_mput: - module_put(desc->owner); - return ret; -} - -/** - * setup_irq - setup an interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt - * - * Used to statically setup interrupts in the early boot process. - */ -int setup_irq(unsigned int irq, struct irqaction *act) -{ - int retval; - struct irq_desc *desc = irq_to_desc(irq); - - if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return -EINVAL; - chip_bus_lock(desc); - retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); - - return retval; -} -EXPORT_SYMBOL_GPL(setup_irq); - -/* - * Internal function to unregister an irqaction - used to free - * regular and special interrupts that are part of the architecture. 
- */ -static struct irqaction *__free_irq(unsigned int irq, void *dev_id) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action, **action_ptr; - unsigned long flags; - - WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); - - if (!desc) - return NULL; - - raw_spin_lock_irqsave(&desc->lock, flags); - - /* - * There can be multiple actions per IRQ descriptor, find the right - * one based on the dev_id: - */ - action_ptr = &desc->action; - for (;;) { - action = *action_ptr; - - if (!action) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return NULL; - } - - if (action->dev_id == dev_id) - break; - action_ptr = &action->next; - } - - /* Found it - now remove it from the list of entries: */ - *action_ptr = action->next; - - /* Currently used only by UML, might disappear one day: */ -#ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->irq_data.chip->release) - desc->irq_data.chip->release(irq, dev_id); -#endif - - /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) - irq_shutdown(desc); - -#ifdef CONFIG_SMP - /* make sure affinity_hint is cleaned up */ - if (WARN_ON_ONCE(desc->affinity_hint)) - desc->affinity_hint = NULL; -#endif - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - unregister_handler_proc(irq, action); - - /* Make sure it's not being used on another CPU: */ - synchronize_irq(irq); - -#ifdef CONFIG_DEBUG_SHIRQ - /* - * It's a shared IRQ -- the driver ought to be prepared for an IRQ - * event to happen even now it's being freed, so let's make sure that - * is so by doing an extra call to the handler .... - * - * ( We do this after actually deregistering it, to make sure that a - * 'real' IRQ doesn't run in * parallel with our fake. 
) - */ - if (action->flags & IRQF_SHARED) { - local_irq_save(flags); - action->handler(irq, dev_id); - local_irq_restore(flags); - } -#endif - - if (action->thread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(action->thread); - put_task_struct(action->thread); - } - - module_put(desc->owner); - return action; -} - -/** - * remove_irq - free an interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt - * - * Used to remove interrupts statically setup by the early boot process. - */ -void remove_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(irq, act->dev_id); -} -EXPORT_SYMBOL_GPL(remove_irq); - -/** - * free_irq - free an interrupt allocated with request_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function must not be called from interrupt context. - */ -void free_irq(unsigned int irq, void *dev_id) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return; - -#ifdef CONFIG_SMP - if (WARN_ON(desc->affinity_notify)) - desc->affinity_notify = NULL; -#endif - - chip_bus_lock(desc); - kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(desc); -} -EXPORT_SYMBOL(free_irq); - -/** - * request_threaded_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. 
- * Primary handler for threaded interrupts - * If NULL and thread_fn != NULL the default - * primary handler is installed - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler is - * still called in hard interrupt context and has to check - * whether the interrupt originates from the device. If yes it - * needs to disable the interrupt on the device and return - * IRQ_WAKE_THREAD which will wake up the handler thread and run - * @thread_fn. This split handler design is necessary to support - * shared interrupts. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. 
- * - * Flags: - * - * IRQF_SHARED Interrupt is shared - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy - * IRQF_TRIGGER_* Specify active edge(s) or level - * - */ -int request_threaded_irq(unsigned int irq, irq_handler_t handler, - irq_handler_t thread_fn, unsigned long irqflags, - const char *devname, void *dev_id) -{ - struct irqaction *action; - struct irq_desc *desc; - int retval; - - /* - * Sanity-check: shared interrupts must pass in a real dev-ID, - * otherwise we'll have trouble later trying to figure out - * which interrupt is which (messes up the interrupt freeing - * logic etc). - */ - if ((irqflags & IRQF_SHARED) && !dev_id) - return -EINVAL; - - desc = irq_to_desc(irq); - if (!desc) - return -EINVAL; - - if (!irq_settings_can_request(desc) || - WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return -EINVAL; - - if (!handler) { - if (!thread_fn) - return -EINVAL; - handler = irq_default_primary_handler; - } - - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->thread_fn = thread_fn; - action->flags = irqflags; - action->name = devname; - action->dev_id = dev_id; - - chip_bus_lock(desc); - retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); - - if (retval) - kfree(action); - -#ifdef CONFIG_DEBUG_SHIRQ_FIXME - if (!retval && (irqflags & IRQF_SHARED)) { - /* - * It's a shared IRQ -- the driver ought to be prepared for it - * to happen immediately, so let's make sure.... - * We disable the irq to make sure that a 'real' IRQ doesn't - * run in parallel with our fake. - */ - unsigned long flags; - - disable_irq(irq); - local_irq_save(flags); - - handler(irq, dev_id); - - local_irq_restore(flags); - enable_irq(irq); - } -#endif - return retval; -} -EXPORT_SYMBOL(request_threaded_irq); - -/** - * request_any_context_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. 
- * Threaded handler for threaded interrupts. - * @flags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It selects either a - * hardirq or threaded handling method depending on the - * context. - * - * On failure, it returns a negative value. On success, - * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. - */ -int request_any_context_irq(unsigned int irq, irq_handler_t handler, - unsigned long flags, const char *name, void *dev_id) -{ - struct irq_desc *desc = irq_to_desc(irq); - int ret; - - if (!desc) - return -EINVAL; - - if (irq_settings_is_nested_thread(desc)) { - ret = request_threaded_irq(irq, NULL, handler, - flags, name, dev_id); - return !ret ? IRQC_IS_NESTED : ret; - } - - ret = request_irq(irq, handler, flags, name, dev_id); - return !ret ? IRQC_IS_HARDIRQ : ret; -} -EXPORT_SYMBOL_GPL(request_any_context_irq); - -void enable_percpu_irq(unsigned int irq, unsigned int type) -{ - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - - if (!desc) - return; - - type &= IRQ_TYPE_SENSE_MASK; - if (type != IRQ_TYPE_NONE) { - int ret; - - ret = __irq_set_trigger(desc, irq, type); - - if (ret) { - WARN(1, "failed to set type for IRQ%d\n", irq); - goto out; - } - } - - irq_percpu_enable(desc, cpu); -out: - irq_put_desc_unlock(desc, flags); -} - -void disable_percpu_irq(unsigned int irq) -{ - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - - if (!desc) - return; - - irq_percpu_disable(desc, cpu); - irq_put_desc_unlock(desc, flags); -} - -/* - * Internal function to unregister a percpu irqaction. 
- */ -static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned long flags; - - WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); - - if (!desc) - return NULL; - - raw_spin_lock_irqsave(&desc->lock, flags); - - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - goto bad; - } - - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); - goto bad; - } - - /* Found it - now remove it from the list of entries: */ - desc->action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - unregister_handler_proc(irq, action); - - module_put(desc->owner); - return action; - -bad: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return NULL; -} - -/** - * remove_percpu_irq - free a per-cpu interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt - * - * Used to remove interrupts statically setup by the early boot process. - */ -void remove_percpu_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && irq_settings_is_per_cpu_devid(desc)) - __free_percpu_irq(irq, act->percpu_dev_id); -} - -/** - * free_percpu_irq - free an interrupt allocated with request_percpu_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove a percpu interrupt handler. The handler is removed, but - * the interrupt line is not disabled. This must be done on each - * CPU before calling this function. The function does not return - * until any executing interrupts for this IRQ have completed. - * - * This function must not be called from interrupt context. 
- */ -void free_percpu_irq(unsigned int irq, void __percpu *dev_id) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !irq_settings_is_per_cpu_devid(desc)) - return; - - chip_bus_lock(desc); - kfree(__free_percpu_irq(irq, dev_id)); - chip_bus_sync_unlock(desc); -} - -/** - * setup_percpu_irq - setup a per-cpu interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt - * - * Used to statically setup per-cpu interrupts in the early boot process. - */ -int setup_percpu_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - int retval; - - if (!desc || !irq_settings_is_per_cpu_devid(desc)) - return -EINVAL; - chip_bus_lock(desc); - retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); - - return retval; -} - -/** - * request_percpu_irq - allocate a percpu interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * @devname: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function - * - * This call allocates interrupt resources, but doesn't - * automatically enable the interrupt. It has to be done on each - * CPU using enable_percpu_irq(). - * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. 
- */ -int request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev_id) -{ - struct irqaction *action; - struct irq_desc *desc; - int retval; - - if (!dev_id) - return -EINVAL; - - desc = irq_to_desc(irq); - if (!desc || !irq_settings_can_request(desc) || - !irq_settings_is_per_cpu_devid(desc)) - return -EINVAL; - - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; - action->name = devname; - action->percpu_dev_id = dev_id; - - chip_bus_lock(desc); - retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); - - if (retval) - kfree(action); - - return retval; -} - -#include -#include - -#include "internals.h" - -void irq_move_masked_irq(struct irq_data *idata) -{ - struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = idata->chip; - - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) - return; - - /* - * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. - */ - if (!irqd_can_balance(&desc->irq_data)) { - WARN_ON(1); - return; - } - - irqd_clr_move_pending(&desc->irq_data); - - if (unlikely(cpumask_empty(desc->pending_mask))) - return; - - if (!chip->irq_set_affinity) - return; - - assert_raw_spin_locked(&desc->lock); - - /* - * If there was a valid mask to work with, please - * do the disable, re-program, enable sequence. - * This is *not* particularly important for level triggered - * but in a edge trigger case, we might be setting rte - * when an active trigger is coming in. This could - * cause some ioapics to mal-function. - * Being paranoid i guess! - * - * For correct operation this depends on the caller - * masking the irqs. 
- */ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) - if (!chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false)) { - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - irq_set_thread_affinity(desc); - } - - cpumask_clear(desc->pending_mask); -} - -void irq_move_irq(struct irq_data *idata) -{ - bool masked; - - if (likely(!irqd_is_setaffinity_pending(idata))) - return; - - if (unlikely(irqd_irq_disabled(idata))) - return; - - /* - * Be careful vs. already masked interrupts. If this is a - * threaded interrupt with ONESHOT set, we can end up with an - * interrupt storm. - */ - masked = irqd_irq_masked(idata); - if (!masked) - idata->chip->irq_mask(idata); - irq_move_masked_irq(idata); - if (!masked) - idata->chip->irq_unmask(idata); -} -/* - * linux/kernel/irq/pm.c - * - * Copyright (C) 2009 Rafael J. Wysocki , Novell Inc. - * - * This file contains power management functions related to interrupts. - */ - -#include -#include -#include -#include - -#include "internals.h" - -/** - * suspend_device_irqs - disable all currently enabled interrupt lines - * - * During system-wide suspend or hibernation device drivers need to be prevented - * from receiving interrupts and this function is provided for this purpose. - * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQS_SUSPENDED flag for each of them. 
- */ -void suspend_device_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, true); - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - for_each_irq_desc(irq, desc) - if (desc->istate & IRQS_SUSPENDED) - synchronize_irq(irq); -} -EXPORT_SYMBOL_GPL(suspend_device_irqs); - -static void resume_irqs(bool want_early) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - unsigned long flags; - bool is_early = desc->action && - desc->action->flags & IRQF_EARLY_RESUME; - - if (is_early != want_early) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq, true); - raw_spin_unlock_irqrestore(&desc->lock, flags); - } -} - -/** - * irq_pm_syscore_ops - enable interrupt lines early - * - * Enable all interrupt lines with %IRQF_EARLY_RESUME set. - */ -static void irq_pm_syscore_resume(void) -{ - resume_irqs(true); -} - -static struct syscore_ops irq_pm_syscore_ops = { - .resume = irq_pm_syscore_resume, -}; - -static int __init irq_pm_init_ops(void) -{ - register_syscore_ops(&irq_pm_syscore_ops); - return 0; -} - -device_initcall(irq_pm_init_ops); - -/** - * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() - * - * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously - * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag - * set as well as those with %IRQF_FORCE_RESUME. 
- */ -void resume_device_irqs(void) -{ - resume_irqs(false); -} -EXPORT_SYMBOL_GPL(resume_device_irqs); - -/** - * check_wakeup_irqs - check if any wake-up interrupts are pending - */ -int check_wakeup_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->istate & IRQS_PENDING) - return -EBUSY; - continue; - } - /* - * Check the non wakeup interrupts whether they need - * to be masked before finally going into suspend - * state. That's for hardware which has no wakeup - * source configuration facility. The chip - * implementation indicates that with - * IRQCHIP_MASK_ON_SUSPEND. - */ - if (desc->istate & IRQS_SUSPENDED && - irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) - mask_irq(desc); - } - - return 0; -} -/* - * linux/kernel/irq/proc.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file contains the /proc/irq/ handling code. - */ - -#include -#include -#include -#include -#include -#include - -#include "internals.h" - -static struct proc_dir_entry *root_irq_dir; - -#ifdef CONFIG_SMP - -static int show_irq_affinity(int type, struct seq_file *m, void *v) -{ - struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_data.affinity; - -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; -#endif - if (type) - seq_cpumask_list(m, mask); - else - seq_cpumask(m, mask); - seq_putc(m, '\n'); - return 0; -} - -static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) -{ - struct irq_desc *desc = irq_to_desc((long)m->private); - unsigned long flags; - cpumask_var_t mask; - - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->affinity_hint) - cpumask_copy(mask, desc->affinity_hint); - raw_spin_unlock_irqrestore(&desc->lock, flags); - - seq_cpumask(m, mask); - 
seq_putc(m, '\n'); - free_cpumask_var(mask); - - return 0; -} - -#ifndef is_affinity_mask_valid -#define is_affinity_mask_valid(val) 1 -#endif - -int no_irq_affinity; -static int irq_affinity_proc_show(struct seq_file *m, void *v) -{ - return show_irq_affinity(0, m, v); -} - -static int irq_affinity_list_proc_show(struct seq_file *m, void *v) -{ - return show_irq_affinity(1, m, v); -} - - -static ssize_t write_irq_affinity(int type, struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; - cpumask_var_t new_value; - int err; - - if (!irq_can_set_affinity(irq) || no_irq_affinity) - return -EIO; - - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) - return -ENOMEM; - - if (type) - err = cpumask_parselist_user(buffer, count, new_value); - else - err = cpumask_parse_user(buffer, count, new_value); - if (err) - goto free_cpumask; - - if (!is_affinity_mask_valid(new_value)) { - err = -EINVAL; - goto free_cpumask; - } - - /* - * Do not allow disabling IRQs completely - it's a too easy - * way to make the system unusable accidentally :-) At least - * one online CPU still has to be targeted. - */ - if (!cpumask_intersects(new_value, cpu_online_mask)) { - /* Special case for empty set - allow the architecture - code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq, new_value) ? 
-EINVAL : count; - } else { - irq_set_affinity(irq, new_value); - err = count; - } - -free_cpumask: - free_cpumask_var(new_value); - return err; -} - -static ssize_t irq_affinity_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - return write_irq_affinity(0, file, buffer, count, pos); -} - -static ssize_t irq_affinity_list_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - return write_irq_affinity(1, file, buffer, count, pos); -} - -static int irq_affinity_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_proc_show, PDE(inode)->data); -} - -static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); -} - -static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); -} - -static const struct file_operations irq_affinity_proc_fops = { - .open = irq_affinity_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = irq_affinity_proc_write, -}; - -static const struct file_operations irq_affinity_hint_proc_fops = { - .open = irq_affinity_hint_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations irq_affinity_list_proc_fops = { - .open = irq_affinity_list_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = irq_affinity_list_proc_write, -}; - -static int default_affinity_show(struct seq_file *m, void *v) -{ - seq_cpumask(m, irq_default_affinity); - seq_putc(m, '\n'); - return 0; -} - -static ssize_t default_affinity_write(struct file *file, - const char __user *buffer, size_t count, loff_t *ppos) -{ - cpumask_var_t new_value; - int err; - - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) - return -ENOMEM; - - err 
= cpumask_parse_user(buffer, count, new_value); - if (err) - goto out; - - if (!is_affinity_mask_valid(new_value)) { - err = -EINVAL; - goto out; - } - - /* - * Do not allow disabling IRQs completely - it's a too easy - * way to make the system unusable accidentally :-) At least - * one online CPU still has to be targeted. - */ - if (!cpumask_intersects(new_value, cpu_online_mask)) { - err = -EINVAL; - goto out; - } - - cpumask_copy(irq_default_affinity, new_value); - err = count; - -out: - free_cpumask_var(new_value); - return err; -} - -static int default_affinity_open(struct inode *inode, struct file *file) -{ - return single_open(file, default_affinity_show, PDE(inode)->data); -} - -static const struct file_operations default_affinity_proc_fops = { - .open = default_affinity_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = default_affinity_write, -}; - -static int irq_node_proc_show(struct seq_file *m, void *v) -{ - struct irq_desc *desc = irq_to_desc((long) m->private); - - seq_printf(m, "%d\n", desc->irq_data.node); - return 0; -} - -static int irq_node_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_node_proc_show, PDE(inode)->data); -} - -static const struct file_operations irq_node_proc_fops = { - .open = irq_node_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -static int irq_spurious_proc_show(struct seq_file *m, void *v) -{ - struct irq_desc *desc = irq_to_desc((long) m->private); - - seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", - desc->irq_count, desc->irqs_unhandled, - jiffies_to_msecs(desc->last_unhandled)); - return 0; -} - -static int irq_spurious_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_spurious_proc_show, PDE(inode)->data); -} - -static const struct file_operations irq_spurious_proc_fops = { - .open = irq_spurious_proc_open, - .read = seq_read, - .llseek = 
seq_lseek, - .release = single_release, -}; - -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned long flags; - int ret = 1; - - raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } - } - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc->dir || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, desc->dir); -} - -#undef MAX_NAMELEN - -#define MAX_NAMELEN 10 - -void register_irq_proc(unsigned int irq, struct irq_desc *desc) -{ - char name [MAX_NAMELEN]; - - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) - return; - - memset(name, 0, MAX_NAMELEN); - sprintf(name, "%d", irq); - - /* create /proc/irq/1234 */ - desc->dir = proc_mkdir(name, root_irq_dir); - if (!desc->dir) - return; - -#ifdef CONFIG_SMP - /* create /proc/irq//smp_affinity */ - proc_create_data("smp_affinity", 0600, desc->dir, - &irq_affinity_proc_fops, (void *)(long)irq); - - /* create /proc/irq//affinity_hint */ - proc_create_data("affinity_hint", 0400, desc->dir, - &irq_affinity_hint_proc_fops, (void *)(long)irq); - - /* create /proc/irq//smp_affinity_list */ - proc_create_data("smp_affinity_list", 0600, desc->dir, - &irq_affinity_list_proc_fops, (void *)(long)irq); - - proc_create_data("node", 0444, desc->dir, - &irq_node_proc_fops, (void *)(long)irq); -#endif - - proc_create_data("spurious", 0444, desc->dir, - 
&irq_spurious_proc_fops, (void *)(long)irq); -} - -void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) -{ - char name [MAX_NAMELEN]; - - if (!root_irq_dir || !desc->dir) - return; -#ifdef CONFIG_SMP - remove_proc_entry("smp_affinity", desc->dir); - remove_proc_entry("affinity_hint", desc->dir); - remove_proc_entry("smp_affinity_list", desc->dir); - remove_proc_entry("node", desc->dir); -#endif - remove_proc_entry("spurious", desc->dir); - - memset(name, 0, MAX_NAMELEN); - sprintf(name, "%u", irq); - remove_proc_entry(name, root_irq_dir); -} - -#undef MAX_NAMELEN - -void unregister_handler_proc(unsigned int irq, struct irqaction *action) -{ - if (action->dir) { - struct irq_desc *desc = irq_to_desc(irq); - - remove_proc_entry(action->dir->name, desc->dir); - } -} - -static void register_default_affinity_proc(void) -{ -#ifdef CONFIG_SMP - proc_create("irq/default_smp_affinity", 0600, NULL, - &default_affinity_proc_fops); -#endif -} - -void init_irq_proc(void) -{ - unsigned int irq; - struct irq_desc *desc; - - /* create /proc/irq */ - root_irq_dir = proc_mkdir("irq", NULL); - if (!root_irq_dir) - return; - - register_default_affinity_proc(); - - /* - * Create entries for all existing IRQs. 
- */ - for_each_irq_desc(irq, desc) { - if (!desc) - continue; - - register_irq_proc(irq, desc); - } -} - -#ifdef CONFIG_GENERIC_IRQ_SHOW - -int __weak arch_show_interrupts(struct seq_file *p, int prec) -{ - return 0; -} - -#ifndef ACTUAL_NR_IRQS -# define ACTUAL_NR_IRQS nr_irqs -#endif - -int show_interrupts(struct seq_file *p, void *v) -{ - static int prec; - - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; - struct irqaction *action; - struct irq_desc *desc; - - if (i > ACTUAL_NR_IRQS) - return 0; - - if (i == ACTUAL_NR_IRQS) - return arch_show_interrupts(p, prec); - - /* print header and calculate the width of the first column */ - if (i == 0) { - for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) - j *= 10; - - seq_printf(p, "%*s", prec + 8, ""); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d", j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - if (!desc) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%*d: ", prec, i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - - if (desc->irq_data.chip) { - if (desc->irq_data.chip->irq_print_chip) - desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); - else if (desc->irq_data.chip->name) - seq_printf(p, " %8s", desc->irq_data.chip->name); - else - seq_printf(p, " %8s", "-"); - } else { - seq_printf(p, " %8s", "None"); - } -#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL - seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); -#endif - if (desc->name) - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} -#endif -/* - * linux/kernel/irq/resend.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner - * - * This file contains the IRQ-resend code - * - * If the interrupt is waiting to be processed, we try to re-run it. - * We can't directly run it from here since the caller might be in an - * interrupt-protected region. Not all irq controller chips can - * retrigger interrupts at the hardware level, so in those cases - * we allow the resending of IRQs via a tasklet. - */ - -#include -#include -#include -#include - -#include "internals.h" - -#ifdef CONFIG_HARDIRQS_SW_RESEND - -/* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); - -/* - * Run software resends of IRQ's - */ -static void resend_irqs(unsigned long arg) -{ - struct irq_desc *desc; - int irq; - - while (!bitmap_empty(irqs_resend, nr_irqs)) { - irq = find_first_bit(irqs_resend, nr_irqs); - clear_bit(irq, irqs_resend); - desc = irq_to_desc(irq); - local_irq_disable(); - desc->handle_irq(irq, desc); - local_irq_enable(); - } -} - -/* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); - -#endif - -/* - * IRQ resend - * - * Is called with interrupts disabled and desc->lock held. - */ -void check_irq_resend(struct irq_desc *desc, unsigned int irq) -{ - /* - * We do not resend level type interrupts. Level type - * interrupts are resent by hardware when they are still - * active. 
- */ - if (irq_settings_is_level(desc)) - return; - if (desc->istate & IRQS_REPLAY) - return; - if (desc->istate & IRQS_PENDING) { - desc->istate &= ~IRQS_PENDING; - desc->istate |= IRQS_REPLAY; - - if (!desc->irq_data.chip->irq_retrigger || - !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { -#ifdef CONFIG_HARDIRQS_SW_RESEND - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); - tasklet_schedule(&resend_tasklet); -#endif - } - } -} -/* - * linux/kernel/irq/spurious.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file contains spurious interrupt handling. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "internals.h" - -static int irqfixup __read_mostly; - -#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) -static void poll_spurious_irqs(unsigned long dummy); -static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); -static int irq_poll_cpu; -static atomic_t irq_poll_active; - -/* - * We wait here for a poller to finish. - * - * If the poll runs on this CPU, then we yell loudly and return - * false. That will leave the interrupt line disabled in the worst - * case, but it should never happen. - * - * We wait until the poller is done and then recheck disabled and - * action (about to be disabled). Only if it's still active, we return - * true and let the handler run. 
- */ -bool irq_wait_for_poll(struct irq_desc *desc) -{ - if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), - "irq poll in progress on cpu %d for irq %d\n", - smp_processor_id(), desc->irq_data.irq)) - return false; - -#ifdef CONFIG_SMP - do { - raw_spin_unlock(&desc->lock); - while (irqd_irq_inprogress(&desc->irq_data)) - cpu_relax(); - raw_spin_lock(&desc->lock); - } while (irqd_irq_inprogress(&desc->irq_data)); - /* Might have been disabled in meantime */ - return !irqd_irq_disabled(&desc->irq_data) && desc->action; -#else - return false; -#endif -} - - -/* - * Recovery handler for misrouted interrupts. - */ -static int try_one_irq(int irq, struct irq_desc *desc, bool force) -{ - irqreturn_t ret = IRQ_NONE; - struct irqaction *action; - - raw_spin_lock(&desc->lock); - - /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) - goto out; - - /* - * Do not poll disabled interrupts unless the spurious - * disabled poller asks explicitely. - */ - if (irqd_irq_disabled(&desc->irq_data) && !force) - goto out; - - /* - * All handlers must agree on IRQF_SHARED, so we test just the - * first. Check for action->next as well. 
- */ - action = desc->action; - if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || - (action->handler(irq, action->dev_id) == IRQ_HANDLED) || - !action->next) - goto out; - - /* Already running on another processor */ - if (irqd_irq_inprogress(&desc->irq_data)) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - desc->istate |= IRQS_PENDING; - goto out; - } - - /* Mark it poll in progress */ - desc->istate |= IRQS_POLL_INPROGRESS; - do { - if (handle_irq_event(desc) == IRQ_HANDLED) - ret = IRQ_HANDLED; - action = desc->action; - } while ((desc->istate & IRQS_PENDING) && action); - desc->istate &= ~IRQS_POLL_INPROGRESS; -out: - raw_spin_unlock(&desc->lock); - return ret == IRQ_HANDLED; -} - -static int misrouted_irq(int irq) -{ - struct irq_desc *desc; - int i, ok = 0; - - if (atomic_inc_return(&irq_poll_active) != 1) - goto out; - - irq_poll_cpu = smp_processor_id(); - - for_each_irq_desc(i, desc) { - if (!i) - continue; - - if (i == irq) /* Already tried */ - continue; - - if (try_one_irq(i, desc, false)) - ok = 1; - } -out: - atomic_dec(&irq_poll_active); - /* So the caller can adjust the irq error counts */ - return ok; -} - -static void poll_spurious_irqs(unsigned long dummy) -{ - struct irq_desc *desc; - int i; - - if (atomic_inc_return(&irq_poll_active) != 1) - goto out; - irq_poll_cpu = smp_processor_id(); - - for_each_irq_desc(i, desc) { - unsigned int state; - - if (!i) - continue; - - /* Racy but it doesn't matter */ - state = desc->istate; - barrier(); - if (!(state & IRQS_SPURIOUS_DISABLED)) - continue; - - local_irq_disable(); - try_one_irq(i, desc, true); - local_irq_enable(); - } -out: - atomic_dec(&irq_poll_active); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); -} - -static inline int bad_action_ret(irqreturn_t action_ret) -{ - if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) - return 0; - return 1; -} - -/* - 
* If 99,900 of the previous 100,000 interrupts have not been handled - * then assume that the IRQ is stuck in some manner. Drop a diagnostic - * and try to turn the IRQ off. - * - * (The other 100-of-100,000 interrupts may have been a correctly - * functioning device sharing an IRQ with the failing one) - */ -static void -__report_bad_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) -{ - struct irqaction *action; - unsigned long flags; - - if (bad_action_ret(action_ret)) { - printk(KERN_ERR "irq event %d: bogus return value %x\n", - irq, action_ret); - } else { - printk(KERN_ERR "irq %d: nobody cared (try booting with " - "the \"irqpoll\" option)\n", irq); - } - dump_stack(); - printk(KERN_ERR "handlers:\n"); - - /* - * We need to take desc->lock here. note_interrupt() is called - * w/o desc->lock held, but IRQ_PROGRESS set. We might race - * with something else removing an action. It's ok to take - * desc->lock here. See synchronize_irq(). - */ - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); - if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", - action->thread_fn, action->thread_fn); - printk(KERN_CONT "\n"); - action = action->next; - } - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -static void -report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) -{ - static int count = 100; - - if (count > 0) { - count--; - __report_bad_irq(irq, desc, action_ret); - } -} - -static inline int -try_misrouted_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) -{ - struct irqaction *action; - - if (!irqfixup) - return 0; - - /* We didn't actually handle the IRQ - see if it was misrouted? 
*/ - if (action_ret == IRQ_NONE) - return 1; - - /* - * But for 'irqfixup == 2' we also do it for handled interrupts if - * they are marked as IRQF_IRQPOLL (or for irq zero, which is the - * traditional PC timer interrupt.. Legacy) - */ - if (irqfixup < 2) - return 0; - - if (!irq) - return 1; - - /* - * Since we don't get the descriptor lock, "action" can - * change under us. We don't really care, but we don't - * want to follow a NULL pointer. So tell the compiler to - * just load it once by using a barrier. - */ - action = desc->action; - barrier(); - return action && (action->flags & IRQF_IRQPOLL); -} - -void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) -{ - if (desc->istate & IRQS_POLL_INPROGRESS) - return; - - /* we get here again via the threaded handler */ - if (action_ret == IRQ_WAKE_THREAD) - return; - - if (bad_action_ret(action_ret)) { - report_bad_irq(irq, desc, action_ret); - return; - } - - if (unlikely(action_ret == IRQ_NONE)) { - /* - * If we are seeing only the odd spurious IRQ caused by - * bus asynchronicity then don't eventually trigger an error, - * otherwise the counter becomes a doomsday timer for otherwise - * working systems - */ - if (time_after(jiffies, desc->last_unhandled + HZ/10)) - desc->irqs_unhandled = 1; - else - desc->irqs_unhandled++; - desc->last_unhandled = jiffies; - } - - if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { - int ok = misrouted_irq(irq); - if (action_ret == IRQ_NONE) - desc->irqs_unhandled -= ok; - } - - desc->irq_count++; - if (likely(desc->irq_count < 100000)) - return; - - desc->irq_count = 0; - if (unlikely(desc->irqs_unhandled > 99900)) { - /* - * The interrupt is stuck - */ - __report_bad_irq(irq, desc, action_ret); - /* - * Now kill the IRQ - */ - printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->istate |= IRQS_SPURIOUS_DISABLED; - desc->depth++; - irq_disable(desc); - - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); - } - 
desc->irqs_unhandled = 0; -} - -bool noirqdebug __read_mostly; - -int noirqdebug_setup(char *str) -{ - noirqdebug = 1; - printk(KERN_INFO "IRQ lockup detection disabled\n"); - - return 1; -} - -__setup("noirqdebug", noirqdebug_setup); -module_param(noirqdebug, bool, 0644); -MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); - -static int __init irqfixup_setup(char *str) -{ - irqfixup = 1; - printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); - printk(KERN_WARNING "This may impact system performance.\n"); - - return 1; -} - -__setup("irqfixup", irqfixup_setup); -module_param(irqfixup, int, 0644); - -static int __init irqpoll_setup(char *str) -{ - irqfixup = 2; - printk(KERN_WARNING "Misrouted IRQ fixup and polling support " - "enabled\n"); - printk(KERN_WARNING "This may significantly impact system " - "performance\n"); - return 1; -} - -__setup("irqpoll", irqpoll_setup); -/* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra - * - * Provides a framework for enqueueing and running callbacks from hardirq - * context. The enqueueing is NMI-safe. - */ - -#include -#include -#include -#include -#include -#include - -/* - * An entry can be in one of four states: - * - * free NULL, 0 -> {claimed} : free to be used - * claimed NULL, 3 -> {pending} : claimed to be enqueued - * pending next, 3 -> {busy} : queued, pending callback - * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed - */ - -#define IRQ_WORK_PENDING 1UL -#define IRQ_WORK_BUSY 2UL -#define IRQ_WORK_FLAGS 3UL - -static DEFINE_PER_CPU(struct llist_head, irq_work_list); - -/* - * Claim the entry so that no one else will poke at it. 
- */ -static bool irq_work_claim(struct irq_work *work) -{ - unsigned long flags, nflags; - - for (;;) { - flags = work->flags; - if (flags & IRQ_WORK_PENDING) - return false; - nflags = flags | IRQ_WORK_FLAGS; - if (cmpxchg(&work->flags, flags, nflags) == flags) - break; - cpu_relax(); - } - - return true; -} - -void __weak arch_irq_work_raise(void) -{ - /* - * Lame architectures will get the timer tick callback - */ -} - -/* - * Queue the entry and raise the IPI if needed. - */ -static void __irq_work_queue(struct irq_work *work) -{ - bool empty; - - preempt_disable(); - - empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - /* The list was empty, raise self-interrupt to start processing. */ - if (empty) - arch_irq_work_raise(); - - preempt_enable(); -} - -/* - * Enqueue the irq_work @entry, returns true on success, failure when the - * @entry was already enqueued by someone else. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue(struct irq_work *work) -{ - if (!irq_work_claim(work)) { - /* - * Already enqueued, can't do! - */ - return false; - } - - __irq_work_queue(work); - return true; -} -EXPORT_SYMBOL_GPL(irq_work_queue); - -/* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. - */ -void irq_work_run(void) -{ - struct irq_work *work; - struct llist_head *this_list; - struct llist_node *llnode; - - this_list = &__get_cpu_var(irq_work_list); - if (llist_empty(this_list)) - return; - - BUG_ON(!in_irq()); - BUG_ON(!irqs_disabled()); - - llnode = llist_del_all(this_list); - while (llnode != NULL) { - work = llist_entry(llnode, struct irq_work, llnode); - - llnode = llist_next(llnode); - - /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - */ - work->flags = IRQ_WORK_BUSY; - work->func(work); - /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. 
- */ - (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); - } -} -EXPORT_SYMBOL_GPL(irq_work_run); - -/* - * Synchronize against the irq_work @entry, ensures the entry is not - * currently in use. - */ -void irq_work_sync(struct irq_work *work) -{ - WARN_ON_ONCE(irqs_disabled()); - - while (work->flags & IRQ_WORK_BUSY) - cpu_relax(); -} -EXPORT_SYMBOL_GPL(irq_work_sync); -/* - * linux/kernel/itimer.c - * - * Copyright (C) 1992 Darren Senn - */ - -/* These are all the functions necessary to implement itimers */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -/** - * itimer_get_remtime - get remaining time for the timer - * - * @timer: the timer to read - * - * Returns the delta between the expiry time and now, which can be - * less than zero or 1usec for an pending expired timer - */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) -{ - ktime_t rem = hrtimer_get_remaining(timer); - - /* - * Racy but safe: if the itimer expires after the above - * hrtimer_get_remtime() call but before this condition - * then we return 0 - which is correct. 
- */ - if (hrtimer_active(timer)) { - if (rem.tv64 <= 0) - rem.tv64 = NSEC_PER_USEC; - } else - rem.tv64 = 0; - - return ktime_to_timeval(rem); -} - -static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) -{ - cputime_t cval, cinterval; - struct cpu_itimer *it = &tsk->signal->it[clock_id]; - - spin_lock_irq(&tsk->sighand->siglock); - - cval = it->expires; - cinterval = it->incr; - if (cval) { - struct task_cputime cputime; - cputime_t t; - - thread_group_cputimer(tsk, &cputime); - if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; - else - /* CPUCLOCK_VIRT */ - t = cputime.utime; - - if (cval < t) - /* about to fire */ - cval = cputime_one_jiffy; - else - cval = cval - t; - } - - spin_unlock_irq(&tsk->sighand->siglock); - - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); -} - -int do_getitimer(int which, struct itimerval *value) -{ - struct task_struct *tsk = current; - - switch (which) { - case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - value->it_value = itimer_get_remtime(&tsk->signal->real_timer); - value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); - spin_unlock_irq(&tsk->sighand->siglock); - break; - case ITIMER_VIRTUAL: - get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); - break; - case ITIMER_PROF: - get_cpu_itimer(tsk, CPUCLOCK_PROF, value); - break; - default: - return(-EINVAL); - } - return 0; -} - -SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) -{ - int error = -EFAULT; - struct itimerval get_buffer; - - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } - return error; -} - - -/* - * The timer is automagically restarted, when interval != 0 - */ -enum hrtimer_restart it_real_fn(struct hrtimer *timer) -{ - struct signal_struct *sig = - container_of(timer, struct signal_struct, real_timer); 
- - trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); - - return HRTIMER_NORESTART; -} - -static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) -{ - struct timespec ts; - s64 cpu_ns; - - cputime_to_timespec(ct, &ts); - cpu_ns = timespec_to_ns(&ts); - - return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; -} - -static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) -{ - cputime_t cval, nval, cinterval, ninterval; - s64 ns_ninterval, ns_nval; - u32 error, incr_error; - struct cpu_itimer *it = &tsk->signal->it[clock_id]; - - nval = timeval_to_cputime(&value->it_value); - ns_nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - ns_ninterval = timeval_to_ns(&value->it_interval); - - error = cputime_sub_ns(nval, ns_nval); - incr_error = cputime_sub_ns(ninterval, ns_ninterval); - - spin_lock_irq(&tsk->sighand->siglock); - - cval = it->expires; - cinterval = it->incr; - if (cval || nval) { - if (nval > 0) - nval += cputime_one_jiffy; - set_process_cpu_timer(tsk, clock_id, &nval, &cval); - } - it->expires = nval; - it->incr = ninterval; - it->error = error; - it->incr_error = incr_error; - trace_itimer_state(clock_id == CPUCLOCK_VIRT ? - ITIMER_VIRTUAL : ITIMER_PROF, value, nval); - - spin_unlock_irq(&tsk->sighand->siglock); - - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } -} - -/* - * Returns true if the timeval is in canonical form - */ -#define timeval_valid(t) \ - (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) - -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) -{ - struct task_struct *tsk = current; - struct hrtimer *timer; - ktime_t expires; - - /* - * Validate the timevals in value. 
- */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - - switch (which) { - case ITIMER_REAL: -again: - spin_lock_irq(&tsk->sighand->siglock); - timer = &tsk->signal->real_timer; - if (ovalue) { - ovalue->it_value = itimer_get_remtime(timer); - ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); - } - /* We are sharing ->siglock with it_real_fn() */ - if (hrtimer_try_to_cancel(timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } - expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) { - tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); - hrtimer_start(timer, expires, HRTIMER_MODE_REL); - } else - tsk->signal->it_real_incr.tv64 = 0; - - trace_itimer_state(ITIMER_REAL, value, 0); - spin_unlock_irq(&tsk->sighand->siglock); - break; - case ITIMER_VIRTUAL: - set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); - break; - case ITIMER_PROF: - set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); - break; - default: - return -EINVAL; - } - return 0; -} - -/** - * alarm_setitimer - set alarm in seconds - * - * @seconds: number of seconds until alarm - * 0 disables the alarm - * - * Returns the remaining time in seconds of a pending timer or 0 when - * the timer is not active. - * - * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid - * negative timeval settings which would cause immediate expiry. - */ -unsigned int alarm_setitimer(unsigned int seconds) -{ - struct itimerval it_new, it_old; - -#if BITS_PER_LONG < 64 - if (seconds > INT_MAX) - seconds = INT_MAX; -#endif - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - - do_setitimer(ITIMER_REAL, &it_new, &it_old); - - /* - * We can't return 0 if we have an alarm pending ... 
And we'd - * better return too much than too little anyway - */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) - it_old.it_value.tv_sec++; - - return it_old.it_value.tv_sec; -} - -SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, - struct itimerval __user *, ovalue) -{ - struct itimerval set_buffer, get_buffer; - int error; - - if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; - } else - memset((char *) &set_buffer, 0, sizeof(set_buffer)); - - error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); - if (error || !ovalue) - return error; - - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) - return -EFAULT; - return 0; -} -/* - * jump label support - * - * Copyright (C) 2009 Jason Baron - * Copyright (C) 2011 Peter Zijlstra - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef HAVE_JUMP_LABEL - -/* mutex to protect coming/going of the the jump_label table */ -static DEFINE_MUTEX(jump_label_mutex); - -void jump_label_lock(void) -{ - mutex_lock(&jump_label_mutex); -} - -void jump_label_unlock(void) -{ - mutex_unlock(&jump_label_mutex); -} - -bool jump_label_enabled(struct jump_label_key *key) -{ - return !!atomic_read(&key->enabled); -} - -static int jump_label_cmp(const void *a, const void *b) -{ - const struct jump_entry *jea = a; - const struct jump_entry *jeb = b; - - if (jea->key < jeb->key) - return -1; - - if (jea->key > jeb->key) - return 1; - - return 0; -} - -static void -jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) -{ - unsigned long size; - - size = (((unsigned long)stop - (unsigned long)start) - / sizeof(struct jump_entry)); - sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); -} - -static void jump_label_update(struct jump_label_key *key, int enable); - -void jump_label_inc(struct jump_label_key *key) -{ - if 
(atomic_inc_not_zero(&key->enabled)) - return; - - jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); - atomic_inc(&key->enabled); - jump_label_unlock(); -} -EXPORT_SYMBOL_GPL(jump_label_inc); - -static void __jump_label_dec(struct jump_label_key *key, - unsigned long rate_limit, struct delayed_work *work) -{ - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) - return; - - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - - jump_label_unlock(); -} -EXPORT_SYMBOL_GPL(jump_label_dec); - -static void jump_label_update_timeout(struct work_struct *work) -{ - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); -} - -void jump_label_dec(struct jump_label_key *key) -{ - __jump_label_dec(key, 0, NULL); -} - -void jump_label_dec_deferred(struct jump_label_key_deferred *key) -{ - __jump_label_dec(&key->key, key->timeout, &key->work); -} - - -void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl) -{ - key->timeout = rl; - INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); -} - -static int addr_conflict(struct jump_entry *entry, void *start, void *end) -{ - if (entry->code <= (unsigned long)end && - entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) - return 1; - - return 0; -} - -static int __jump_label_text_reserved(struct jump_entry *iter_start, - struct jump_entry *iter_stop, void *start, void *end) -{ - struct jump_entry *iter; - - iter = iter_start; - while (iter < iter_stop) { - if (addr_conflict(iter, start, end)) - return 1; - iter++; - } - - return 0; -} - -/* - * Update code which is definitely not currently executing. - * Architectures which need heavyweight synchronization to modify - * running code can override this to make the non-live update case - * cheaper. 
- */ -void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - arch_jump_label_transform(entry, type); -} - -static void __jump_label_update(struct jump_label_key *key, - struct jump_entry *entry, - struct jump_entry *stop, int enable) -{ - for (; (entry < stop) && - (entry->key == (jump_label_t)(unsigned long)key); - entry++) { - /* - * entry->code set to 0 invalidates module init text sections - * kernel_text_address() verifies we are not in core kernel - * init code, see jump_label_invalidate_module_init(). - */ - if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, enable); - } -} - -void __init jump_label_init(void) -{ - struct jump_entry *iter_start = __start___jump_table; - struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; - struct jump_entry *iter; - - jump_label_lock(); - jump_label_sort_entries(iter_start, iter_stop); - - for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); - if (iterk == key) - continue; - - key = iterk; - key->entries = iter; -#ifdef CONFIG_MODULES - key->next = NULL; -#endif - } - jump_label_unlock(); -} - -#ifdef CONFIG_MODULES - -struct jump_label_mod { - struct jump_label_mod *next; - struct jump_entry *entries; - struct module *mod; -}; - -static int __jump_label_mod_text_reserved(void *start, void *end) -{ - struct module *mod; - - mod = __module_text_address((unsigned long)start); - if (!mod) - return 0; - - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - - return __jump_label_text_reserved(mod->jump_entries, - mod->jump_entries + mod->num_jump_entries, - start, end); -} - -static void __jump_label_mod_update(struct jump_label_key *key, int enable) -{ - struct jump_label_mod *mod = key->next; - - while (mod) { - struct module *m = mod->mod; - - __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries, - enable); - mod = mod->next; - } -} - -/*** - * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() - * @mod: module to patch - * - * Allow for run-time selection of the optimal nops. Before the module - * loads patch these with arch_get_jump_label_nop(), which is specified by - * the arch specific jump label code. - */ -void jump_label_apply_nops(struct module *mod) -{ - struct jump_entry *iter_start = mod->jump_entries; - struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; - struct jump_entry *iter; - - /* if the module doesn't have jump label entries, just return */ - if (iter_start == iter_stop) - return; - - for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); - } -} - -static int jump_label_add_module(struct module *mod) -{ - struct jump_entry *iter_start = mod->jump_entries; - struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; - struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; - - /* if the module doesn't have jump label entries, just return */ - if (iter_start == iter_stop) - return 0; - - jump_label_sort_entries(iter_start, iter_stop); - - for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; - - key = (struct jump_label_key *)(unsigned long)iter->key; - - if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; - key->next = NULL; - continue; - } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); - if (!jlm) - return -ENOMEM; - - jlm->mod = mod; - jlm->entries = iter; - jlm->next = key->next; - key->next = jlm; - - if (jump_label_enabled(key)) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); - } - - return 0; -} - -static void jump_label_del_module(struct module *mod) -{ - struct jump_entry *iter_start = mod->jump_entries; - struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; - struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; - - for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; - - key = (struct jump_label_key *)(unsigned long)iter->key; - - if (__module_address(iter->key) == mod) - continue; - - prev = &key->next; - jlm = key->next; - - while (jlm && jlm->mod != mod) { - prev = &jlm->next; - jlm = jlm->next; - } - - if (jlm) { - *prev = jlm->next; - kfree(jlm); - } - } -} - -static void jump_label_invalidate_module_init(struct module *mod) -{ - struct jump_entry *iter_start = mod->jump_entries; - struct jump_entry *iter_stop = iter_start + 
mod->num_jump_entries; - struct jump_entry *iter; - - for (iter = iter_start; iter < iter_stop; iter++) { - if (within_module_init(iter->code, mod)) - iter->code = 0; - } -} - -static int -jump_label_module_notify(struct notifier_block *self, unsigned long val, - void *data) -{ - struct module *mod = data; - int ret = 0; - - switch (val) { - case MODULE_STATE_COMING: - jump_label_lock(); - ret = jump_label_add_module(mod); - if (ret) - jump_label_del_module(mod); - jump_label_unlock(); - break; - case MODULE_STATE_GOING: - jump_label_lock(); - jump_label_del_module(mod); - jump_label_unlock(); - break; - case MODULE_STATE_LIVE: - jump_label_lock(); - jump_label_invalidate_module_init(mod); - jump_label_unlock(); - break; - } - - return notifier_from_errno(ret); -} - -struct notifier_block jump_label_module_nb = { - .notifier_call = jump_label_module_notify, - .priority = 1, /* higher than tracepoints */ -}; - -static __init int jump_label_init_module(void) -{ - return register_module_notifier(&jump_label_module_nb); -} -early_initcall(jump_label_init_module); - -#endif /* CONFIG_MODULES */ - -/*** - * jump_label_text_reserved - check if addr range is reserved - * @start: start text addr - * @end: end text addr - * - * checks if the text addr located between @start and @end - * overlaps with any of the jump label patch addresses. Code - * that wants to modify kernel text should first verify that - * it does not overlap with any of the jump label addresses. - * Caller must hold jump_label_mutex. 
- * - * returns 1 if there is an overlap, 0 otherwise - */ -int jump_label_text_reserved(void *start, void *end) -{ - int ret = __jump_label_text_reserved(__start___jump_table, - __stop___jump_table, start, end); - - if (ret) - return ret; - -#ifdef CONFIG_MODULES - ret = __jump_label_mod_text_reserved(start, end); -#endif - return ret; -} - -static void jump_label_update(struct jump_label_key *key, int enable) -{ - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; - -#ifdef CONFIG_MODULES - struct module *mod = __module_address((jump_label_t)key); - - __jump_label_mod_update(key, enable); - - if (mod) - stop = mod->jump_entries + mod->num_jump_entries; -#endif - /* if there are no users, entry can be NULL */ - if (entry) - __jump_label_update(key, entry, stop, enable); -} - -#endif -/* - * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. - * - * Rewritten and vastly simplified by Rusty Russell for in-kernel - * module loader: - * Copyright 2002 Rusty Russell IBM Corporation - * - * ChangeLog: - * - * (25/Aug/2004) Paulo Marques - * Changed the compression method from stem compression to "table lookup" - * compression (see scripts/kallsyms.c for a more complete description) - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for cond_resched */ -#include -#include -#include - -#include - -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - -/* - * These will be re-linked against their real values - * during the second link stage. - */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); - -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). 
- */ -extern const unsigned long kallsyms_num_syms -__attribute__((weak, section(".rodata"))); - -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); - -extern const unsigned long kallsyms_markers[] __attribute__((weak)); - -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) - return 1; - return in_gate_area_no_mm(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_mm(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if (all_var) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - -/* - * Expand a compressed symbol data into the resulting uncompressed string, - * given the offset to where the symbol is in the compressed stream. - */ -static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) -{ - int len, skipped_first = 0; - const u8 *tptr, *data; - - /* Get the compressed symbol length from the first symbol byte. */ - data = &kallsyms_names[off]; - len = *data; - data++; - - /* - * Update the offset to return the offset for the next symbol on - * the compressed stream. - */ - off += len + 1; - - /* - * For every byte on the compressed symbol data, copy the table - * entry for that byte. - */ - while (len) { - tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; - data++; - len--; - - while (*tptr) { - if (skipped_first) { - *result = *tptr; - result++; - } else - skipped_first = 1; - tptr++; - } - } - - *result = '\0'; - - /* Return to offset to the next symbol. */ - return off; -} - -/* - * Get symbol type information. 
This is encoded as a single char at the - * beginning of the symbol name. - */ -static char kallsyms_get_symbol_type(unsigned int off) -{ - /* - * Get just the first code, look it up in the token table, - * and return the first char from this token. - */ - return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; -} - - -/* - * Find the offset on the compressed stream given and index in the - * kallsyms array. - */ -static unsigned int get_symbol_offset(unsigned long pos) -{ - const u8 *name; - int i; - - /* - * Use the closest marker we have. We have markers every 256 positions, - * so that should be close enough. - */ - name = &kallsyms_names[kallsyms_markers[pos >> 8]]; - - /* - * Sequentially scan all the symbols up to the point we're searching - * for. Every symbol is stored in a [][ bytes of data] format, - * so we just need to add the len to the current pointer for every - * symbol we wish to skip. - */ - for (i = 0; i < (pos & 0xFF); i++) - name = name + (*name) + 1; - - return name - kallsyms_names; -} - -/* Lookup the address for this symbol. Returns 0 if not found. 
*/ -unsigned long kallsyms_lookup_name(const char *name) -{ - char namebuf[KSYM_NAME_LEN]; - unsigned long i; - unsigned int off; - - for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); - - if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; - } - return module_kallsyms_lookup_name(name); -} -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); - -int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, - unsigned long), - void *data) -{ - char namebuf[KSYM_NAME_LEN]; - unsigned long i; - unsigned int off; - int ret; - - for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); - if (ret != 0) - return ret; - } - return module_kallsyms_on_each_symbol(fn, data); -} -EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); - -static unsigned long get_symbol_pos(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset) -{ - unsigned long symbol_start = 0, symbol_end = 0; - unsigned long i, low, high, mid; - - /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); - - /* Do a binary search on the sorted kallsyms_addresses array. */ - low = 0; - high = kallsyms_num_syms; - - while (high - low > 1) { - mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) - low = mid; - else - high = mid; - } - - /* - * Search for the first aliased symbol. Aliased - * symbols are symbols with the same address. - */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) - --low; - - symbol_start = kallsyms_addresses[low]; - - /* Search for next non-aliased symbol. */ - for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; - break; - } - } - - /* If we found no next symbol, we use the end of the section. 
*/ - if (!symbol_end) { - if (is_kernel_inittext(addr)) - symbol_end = (unsigned long)_einittext; - else if (all_var) - symbol_end = (unsigned long)_end; - else - symbol_end = (unsigned long)_etext; - } - - if (symbolsize) - *symbolsize = symbol_end - symbol_start; - if (offset) - *offset = addr - symbol_start; - - return low; -} - -/* - * Lookup an address but don't bother to find any names. - */ -int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, - unsigned long *offset) -{ - char namebuf[KSYM_NAME_LEN]; - if (is_ksym_addr(addr)) - return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); -} - -/* - * Lookup an address - * - modname is set to NULL if it's in the kernel. - * - We guarantee that the returned name is valid until we reschedule even if. - * It resides in a module. - * - We also guarantee that modname will be valid until rescheduled. - */ -const char *kallsyms_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, char *namebuf) -{ - namebuf[KSYM_NAME_LEN - 1] = 0; - namebuf[0] = 0; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, symbolsize, offset); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); - if (modname) - *modname = NULL; - return namebuf; - } - - /* See if it's in a module. */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); -} - -int lookup_symbol_name(unsigned long addr, char *symname) -{ - symname[0] = '\0'; - symname[KSYM_NAME_LEN - 1] = '\0'; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, NULL, NULL); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), symname); - return 0; - } - /* See if it's in a module. 
*/ - return lookup_module_symbol_name(addr, symname); -} - -int lookup_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - name[0] = '\0'; - name[KSYM_NAME_LEN - 1] = '\0'; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, size, offset); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), name); - modname[0] = '\0'; - return 0; - } - /* See if it's in a module. */ - return lookup_module_symbol_attrs(addr, size, offset, modname, name); -} - -/* Look up a kernel symbol and return it in a text buffer. */ -static int __sprint_symbol(char *buffer, unsigned long address, - int symbol_offset) -{ - char *modname; - const char *name; - unsigned long offset, size; - int len; - - address += symbol_offset; - name = kallsyms_lookup(address, &size, &offset, &modname, buffer); - if (!name) - return sprintf(buffer, "0x%lx", address); - - if (name != buffer) - strcpy(buffer, name); - len = strlen(buffer); - buffer += len; - offset -= symbol_offset; - - if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); - else - len += sprintf(buffer, "+%#lx/%#lx", offset, size); - - return len; -} - -/** - * sprint_symbol - Look up a kernel symbol and return it in a text buffer - * @buffer: buffer to be stored - * @address: address to lookup - * - * This function looks up a kernel symbol with @address and stores its name, - * offset, size and module name to @buffer if possible. If no symbol was found, - * just saves its @address as is. - * - * This function returns the number of bytes stored in @buffer. 
- */ -int sprint_symbol(char *buffer, unsigned long address) -{ - return __sprint_symbol(buffer, address, 0); -} - -EXPORT_SYMBOL_GPL(sprint_symbol); - -/** - * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer - * @buffer: buffer to be stored - * @address: address to lookup - * - * This function is for stack backtrace and does the same thing as - * sprint_symbol() but with modified/decreased @address. If there is a - * tail-call to the function marked "noreturn", gcc optimized out code after - * the call so that the stack-saved return address could point outside of the - * caller. This function ensures that kallsyms will find the original caller - * by decreasing @address. - * - * This function returns the number of bytes stored in @buffer. - */ -int sprint_backtrace(char *buffer, unsigned long address) -{ - return __sprint_symbol(buffer, address, -1); -} - -/* Look up a kernel symbol and print it to the kernel messages. */ -void __print_symbol(const char *fmt, unsigned long address) -{ - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - - printk(fmt, buffer); -} -EXPORT_SYMBOL(__print_symbol); - -/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ -struct kallsym_iter { - loff_t pos; - unsigned long value; - unsigned int nameoff; /* If iterating in core kernel symbols. */ - char type; - char name[KSYM_NAME_LEN]; - char module_name[MODULE_NAME_LEN]; - int exported; -}; - -static int get_ksymbol_mod(struct kallsym_iter *iter) -{ - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) - return 0; - return 1; -} - -/* Returns space to next name. 
*/ -static unsigned long get_ksymbol_core(struct kallsym_iter *iter) -{ - unsigned off = iter->nameoff; - - iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; - - iter->type = kallsyms_get_symbol_type(off); - - off = kallsyms_expand_symbol(off, iter->name); - - return off - iter->nameoff; -} - -static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) -{ - iter->name[0] = '\0'; - iter->nameoff = get_symbol_offset(new_pos); - iter->pos = new_pos; -} - -/* Returns false if pos at or past end of file. */ -static int update_iter(struct kallsym_iter *iter, loff_t pos) -{ - /* Module symbols can be accessed randomly. */ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } - - /* If we're not on the desired position, reset to new position. */ - if (pos != iter->pos) - reset_iter(iter, pos); - - iter->nameoff += get_ksymbol_core(iter); - iter->pos++; - - return 1; -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - (*pos)++; - - if (!update_iter(m->private, *pos)) - return NULL; - return p; -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - if (!update_iter(m->private, *pos)) - return NULL; - return m->private; -} - -static void s_stop(struct seq_file *m, void *p) -{ -} - -static int s_show(struct seq_file *m, void *p) -{ - struct kallsym_iter *iter = m->private; - - /* Some debugging symbols have no name. Ignore them. */ - if (!iter->name[0]) - return 0; - - if (iter->module_name[0]) { - char type; - - /* - * Label it "global" if it is exported, - * "local" if not exported. - */ - type = iter->exported ? 
toupper(iter->type) : - tolower(iter->type); - seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, - type, iter->name, iter->module_name); - } else - seq_printf(m, "%pK %c %s\n", (void *)iter->value, - iter->type, iter->name); - return 0; -} - -static const struct seq_operations kallsyms_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show -}; - -static int kallsyms_open(struct inode *inode, struct file *file) -{ - /* - * We keep iterator in m->private, since normal case is to - * s_start from where we left off, so we avoid doing - * using get_symbol_offset for every symbol. - */ - struct kallsym_iter *iter; - int ret; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - reset_iter(iter, 0); - - ret = seq_open(file, &kallsyms_op); - if (ret == 0) - ((struct seq_file *)file->private_data)->private = iter; - else - kfree(iter); - return ret; -} - -#ifdef CONFIG_KGDB_KDB -const char *kdb_walk_kallsyms(loff_t *pos) -{ - static struct kallsym_iter kdb_walk_kallsyms_iter; - if (*pos == 0) { - memset(&kdb_walk_kallsyms_iter, 0, - sizeof(kdb_walk_kallsyms_iter)); - reset_iter(&kdb_walk_kallsyms_iter, 0); - } - while (1) { - if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) - return NULL; - ++*pos; - /* Some debugging symbols have no name. Ignore them. */ - if (kdb_walk_kallsyms_iter.name[0]) - return kdb_walk_kallsyms_iter.name; - } -} -#endif /* CONFIG_KGDB_KDB */ - -static const struct file_operations kallsyms_operations = { - .open = kallsyms_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static int __init kallsyms_init(void) -{ - proc_create("kallsyms", 0444, NULL, &kallsyms_operations); - return 0; -} -device_initcall(kallsyms_init); -/* - * kexec.c - kexec system call - * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t __percpu *crash_notes; - -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -int kexec_should_crash(struct task_struct *p) -{ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. - * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in . - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. 
- * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. 
- */ -#define KIMAGE_NO_DEST (-1UL) - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long dest); - -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - * - * Since the kernel does everything in page size chunks ensure - * the destination addresses are page aligned. Too many - * special cases crop of when we don't do this. The most - * insidious is getting overlapping destination addresses - * simply because addresses are changed to page size - * granularity. 
- */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; - } - - /* Verify our destination addresses do not overlap. - * If we alloed overlapping destination addresses - * through very weird things can happen with no - * easy explanation as one segment stops on another. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - unsigned long j; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - for (j = 0; j < i; j++) { - unsigned long pstart, pend; - pstart = image->segment[j].mem; - pend = pstart + image->segment[j].memsz; - /* Do the segments overlap ? */ - if ((mend > pstart) && (mstart < pend)) - goto out; - } - } - - /* Ensure our buffer sizes are strictly less than - * our memory sizes. This should always be the case, - * and it is easier to check up front than to be surprised - * later on. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; - } - - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); - - return result; - -} - -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - - /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; - - *rimage = image; - - /* - * Find a location for the control code buffer, and add it - * the vector of segments so that it's pages will also be - * counted as destination pages. 
- */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; - } - - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - printk(KERN_ERR "Could not allocate swap buffer\n"); - goto out; - } - - result = 0; - out: - if (result == 0) - *rimage = image; - else - kfree(image); - - return result; -} - -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - unsigned long i; - - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; - goto out; - } - - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; - - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. 
- */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out; - } - - /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. - */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; - } - - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); - - return result; -} - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, - unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) - return 1; - } - - return 0; -} - -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *pages; - - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - set_page_private(pages, order); - count = 1 << order; - for (i = 0; i < count; i++) - SetPageReserved(pages + i); - } - - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - - order = page_private(page); - count = 1 << order; - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); -} - -static void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - 
kimage_free_pages(page); - } -} - -static struct page *kimage_alloc_normal_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. - */ - do { - unsigned long pfn, epfn, addr, eaddr; - - pages = kimage_alloc_pages(GFP_KERNEL, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while (!pages); - - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everything to image->dest_pages. - * - * For now it is simpler to just free the pages. 
- */ - kimage_free_page_list(&extra_pages); - - return pages; -} - -static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * Control pages are also the only pags we must allocate - * when loading a crash kernel. All of the other pages - * are specified by the segments and we just memcpy - * into them directly. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * Given the low demand this implements a very simple - * allocator that finds the first hole of the appropriate - * size in the reserved memory region, and allocates all - * of the memory up to and including the hole. - */ - unsigned long hole_start, hole_end, size; - struct page *pages; - - pages = NULL; - size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { - unsigned long i; - - if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) - break; - if (hole_end > crashk_res.end) - break; - /* See if I overlap any of the segments */ - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - if ((hole_end >= mstart) && (hole_start <= mend)) { - /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - break; - } - } - /* If I don't overlap any segments I have found my hole! 
*/ - if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); - break; - } - } - if (pages) - image->control_page = hole_end; - - return pages; -} - - -struct page *kimage_alloc_control_pages(struct kimage *image, - unsigned int order) -{ - struct page *pages = NULL; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - pages = kimage_alloc_normal_control_pages(image, order); - break; - case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); - break; - } - - return pages; -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) - image->entry++; - - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) - return -ENOMEM; - - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = ind_page + - ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - - return 0; -} - -static int kimage_set_destination(struct kimage *image, - unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) - image->destination = destination; - - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) - image->destination += PAGE_SIZE; - - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); - -} -static void kimage_terminate(struct kimage 
*image) -{ - if (*image->entry != 0) - image->entry++; - - *image->entry = IND_DONE; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -static void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } - else if (entry & IND_SOURCE) - kimage_free_entry(entry); - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, - unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) - destination = entry & PAGE_MASK; - else if (entry & IND_SOURCE) { - if (page == destination) - return ptr; - destination += PAGE_SIZE; - } - } - - return NULL; -} - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. 
- * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. - * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) - return NULL; - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > - (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. 
- */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it if it's - * gfp_flags honor the ones passed in. - */ - if (!(gfp_mask & __GFP_HIGHMEM) && - PageHighMem(old_page)) { - kimage_free_pages(old_page); - continue; - } - addr = old_addr; - page = old_page; - break; - } - else { - /* Place the page on the destination list I - * will use it later. - */ - list_add(&page->lru, &image->dest_pages); - } - } - - return page; -} - -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long maddr; - unsigned long ubytes, mbytes; - int result; - unsigned char __user *buf; - - result = 0; - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - - result = kimage_set_destination(image, maddr); - if (result < 0) - goto out; - - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (!page) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) - << PAGE_SHIFT); - if (result < 0) - goto out; - - ptr = kmap(page); - /* Start with a clear page */ - clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) - uchunk = ubytes; - - result = copy_from_user(ptr, buf, uchunk); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) -{ - /* For 
crash dumps kernels we simply copy the data from - * user space to it's destination. - * We do things a page at a time for the sake of kmap. - */ - unsigned long maddr; - unsigned long ubytes, mbytes; - int result; - unsigned char __user *buf; - - result = 0; - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = pfn_to_page(maddr >> PAGE_SHIFT); - if (!page) { - result = -ENOMEM; - goto out; - } - ptr = kmap(page); - ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) { - uchunk = ubytes; - /* Zero the trailing part of the page */ - memset(ptr + uchunk, 0, mchunk - uchunk); - } - result = copy_from_user(ptr, buf, uchunk); - kexec_flush_icache_page(page); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - int result = -ENOMEM; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); - break; - case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); - break; - } - - return result; -} - -/* - * Exec Kernel system call: for obvious reasons only root may call it. - * - * This call breaks up into three pieces. - * - A generic part which loads the new kernel from the current - * address space, and very carefully places the data in the - * allocated pages. - * - * - A generic part that interacts with the kernel and tells all of - * the devices to shut down. Preventing on-going dmas, and placing - * the devices in a consistent state so a later kernel can - * reinitialize them. 
- * - * - A machine specific part that includes the syscall number - * and the copies the image to it's final destination. And - * jumps into the image at entry. - * - * kexec does not sync, or unmount filesystems so if you need - * that to happen you need to do that yourself. - */ -struct kimage *kexec_image; -struct kimage *kexec_crash_image; - -static DEFINE_MUTEX(kexec_mutex); - -SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, - struct kexec_segment __user *, segments, unsigned long, flags) -{ - struct kimage **dest_image, *image; - int result; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - /* - * Verify we have a legal set of flags - * This leaves us room for future extensions. - */ - if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) - return -EINVAL; - - /* Verify we are on the appropriate architecture */ - if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && - ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) - return -EINVAL; - - /* Put an artificial cap on the number - * of segments passed to kexec_load. - */ - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - - image = NULL; - result = 0; - - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. 
- */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) - dest_image = &kexec_crash_image; - if (nr_segments > 0) { - unsigned long i; - - /* Loading another kernel to reboot into */ - if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); - /* Loading another kernel to switch to if this one crashes */ - else if (flags & KEXEC_ON_CRASH) { - /* Free any current crash dump kernel before - * we corrupt it. - */ - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); - crash_map_reserved_pages(); - } - if (result) - goto out; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto out; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto out; - } - kimage_terminate(image); - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); - -out: - mutex_unlock(&kexec_mutex); - kimage_free(image); - - return result; -} - -/* - * Add and remove page tables for crashkernel memory - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - -#ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) -{ - struct compat_kexec_segment in; - struct kexec_segment out, __user *ksegments; - unsigned long i, result; - - /* Don't allow clients that don't understand the native - * architecture to do anything. 
- */ - if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) - return -EINVAL; - - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); - for (i=0; i < nr_segments; i++) { - result = copy_from_user(&in, &segments[i], sizeof(in)); - if (result) - return -EFAULT; - - out.buf = compat_ptr(in.buf); - out.bufsz = in.bufsz; - out.mem = in.mem; - out.memsz = in.memsz; - - result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) - return -EFAULT; - } - - return sys_kexec_load(entry, nr_segments, ksegments, flags); -} -#endif - -void crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_mutex here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (mutex_trylock(&kexec_mutex)) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - mutex_unlock(&kexec_mutex); - } -} - -size_t crash_get_memory_size(void) -{ - size_t size = 0; - mutex_lock(&kexec_mutex); - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); - return size; -} - -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long start, end; - unsigned long old_size; - struct resource *ram_res; - - mutex_lock(&kexec_mutex); - - 
if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - - crash_map_reserved_pages(); - crash_free_reserved_phys_range(end, crashk_res.end); - - if ((start == end) && (crashk_res.parent != NULL)) - release_resource(&crashk_res); - - ram_res->start = end; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; - ram_res->name = "System RAM"; - - crashk_res.end = end - 1; - - insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); - -unlock: - mutex_unlock(&kexec_mutex); - return ret; -} - -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. 
- */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. */ - crash_notes = alloc_percpu(note_buf_t); - if (!crash_notes) { - printk("Kexec: Memory allocation for saving cpu register" - " states failed\n"); - return -ENOMEM; - } - return 0; -} -module_init(crash_notes_memory_init) - - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warning("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("crashkernel: Memory " - "value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warning("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warning("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - 
pr_warning("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warning("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warning("Memory value expected " - "after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warning("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - - return 0; -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. 
- */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *p = cmdline, *ck_cmdline = NULL; - char *first_colon, *first_space; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); - while (p) { - ck_cmdline = p; - p = strstr(p+1, "crashkernel="); - } - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += 12; /* strlen("crashkernel=") */ - - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - - return 0; -} - - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) 
-{ - va_list args; - char buf[0x50]; - int r; - - va_start(args, fmt); - r = vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - if (r + vmcoreinfo_size > vmcoreinfo_max_size) - r = vmcoreinfo_max_size - vmcoreinfo_size; - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) -{} - -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); - VMCOREINFO_SYMBOL(swapper_pg_dir); - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - 
VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -module_init(crash_save_vmcoreinfo_init) - -/* - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -int kernel_kexec(void) -{ - int error = 0; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - if (!kexec_image) { - error = -EINVAL; - goto Unlock; - } - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - lock_system_sleep(); - pm_prepare_console(); - error = freeze_processes(); - if (error) { - error = -EBUSY; - goto Restore_console; - } - suspend_console(); - error = dpm_suspend_start(PMSG_FREEZE); - if (error) - goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. 
- */ - error = dpm_suspend_noirq(PMSG_FREEZE); - if (error) - goto Resume_devices; - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - local_irq_disable(); - error = syscore_suspend(); - if (error) - goto Enable_irqs; - } else -#endif - { - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - } - - machine_kexec(kexec_image); - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - syscore_resume(); - Enable_irqs: - local_irq_enable(); - Enable_cpus: - enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); - Resume_devices: - dpm_resume_end(PMSG_RESTORE); - Resume_console: - resume_console(); - thaw_processes(); - Restore_console: - pm_restore_console(); - unlock_system_sleep(); - } -#endif - - Unlock: - mutex_unlock(&kexec_mutex); - return error; -} -/* - * A generic kernel FIFO implementation - * - * Copyright (C) 2009/2010 Stefani Seibold - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * internal helper to calculate the unused elements in a fifo - */ -static inline unsigned int kfifo_unused(struct __kfifo *fifo) -{ - return (fifo->mask + 1) - (fifo->in - fifo->out); -} - -int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask) -{ - /* - * round down to the next power of 2, since our 'let the indices - * wrap' technique works only in this case. - */ - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - - if (size < 2) { - fifo->data = NULL; - fifo->mask = 0; - return -EINVAL; - } - - fifo->data = kmalloc(size * esize, gfp_mask); - - if (!fifo->data) { - fifo->mask = 0; - return -ENOMEM; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_alloc); - -void __kfifo_free(struct __kfifo *fifo) -{ - kfree(fifo->data); - fifo->in = 0; - fifo->out = 0; - fifo->esize = 0; - fifo->data = NULL; - fifo->mask = 0; -} -EXPORT_SYMBOL(__kfifo_free); - -int __kfifo_init(struct __kfifo *fifo, void *buffer, - unsigned int size, size_t esize) -{ - size /= esize; - - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - fifo->data = buffer; - - if (size < 2) { - fifo->mask = 0; - return -EINVAL; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_init); - -static void kfifo_copy_in(struct __kfifo *fifo, const void *src, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(fifo->data + off, src, l); - memcpy(fifo->data, src + l, len - l); - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); -} 
- -unsigned int __kfifo_in(struct __kfifo *fifo, - const void *buf, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - kfifo_copy_in(fifo, buf, len, fifo->in); - fifo->in += len; - return len; -} -EXPORT_SYMBOL(__kfifo_in); - -static void kfifo_copy_out(struct __kfifo *fifo, void *dst, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(dst, fifo->data + off, l); - memcpy(dst + l, fifo->data, len - l); - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_out_peek(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - kfifo_copy_out(fifo, buf, len, fifo->out); - return len; -} -EXPORT_SYMBOL(__kfifo_out_peek); - -unsigned int __kfifo_out(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - len = __kfifo_out_peek(fifo, buf, len); - fifo->out += len; - return len; -} -EXPORT_SYMBOL(__kfifo_out); - -static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned int *copied) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned long ret; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_from_user(fifo->data + off, from, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_from_user(fifo->data, from + l, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); 
- *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->in += len; - return err; -} -EXPORT_SYMBOL(__kfifo_from_user); - -static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, - unsigned int len, unsigned int off, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_to_user(to, fifo->data + off, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_to_user(to + l, fifo->data, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_to_user(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->out += len; - return err; -} -EXPORT_SYMBOL(__kfifo_to_user); - -static int setup_sgl_buf(struct scatterlist *sgl, void *buf, - int 
nents, unsigned int len) -{ - int n; - unsigned int l; - unsigned int off; - struct page *page; - - if (!nents) - return 0; - - if (!len) - return 0; - - n = 0; - page = virt_to_page(buf); - off = offset_in_page(buf); - l = 0; - - while (len >= l + PAGE_SIZE - off) { - struct page *npage; - - l += PAGE_SIZE; - buf += PAGE_SIZE; - npage = virt_to_page(buf); - if (page_to_phys(page) != page_to_phys(npage) - l) { - sg_set_page(sgl, page, l - off, off); - sgl = sg_next(sgl); - if (++n == nents || sgl == NULL) - return n; - page = npage; - len -= l - off; - l = off = 0; - } - } - sg_set_page(sgl, page, len, off); - return n + 1; -} - -static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, - int nents, unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned int n; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - n = setup_sgl_buf(sgl, fifo->data + off, nents, l); - n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); - - return n; -} - -unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->in); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare); - -unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->out); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare); - -unsigned int __kfifo_max_r(unsigned int len, size_t recsize) -{ - unsigned int max = (1 << (recsize << 3)) - 1; - - if (len > max) - return max; - return len; -} - -#define __KFIFO_PEEK(data, out, mask) \ - ((data)[(out) & (mask)]) -/* - * __kfifo_peek_n internal helper 
function for determinate the length of - * the next record in the fifo - */ -static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) -{ - unsigned int l; - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - l = __KFIFO_PEEK(data, fifo->out, mask); - - if (--recsize) - l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; - - return l; -} - -#define __KFIFO_POKE(data, in, mask, val) \ - ( \ - (data)[(in) & (mask)] = (unsigned char)(val) \ - ) - -/* - * __kfifo_poke_n internal helper function for storeing the length of - * the record into the fifo - */ -static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) -{ - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - __KFIFO_POKE(data, fifo->in, mask, n); - - if (recsize > 1) - __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); -} - -unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) -{ - return __kfifo_peek_n(fifo, recsize); -} -EXPORT_SYMBOL(__kfifo_len_r); - -unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, - unsigned int len, size_t recsize) -{ - if (len + recsize > kfifo_unused(fifo)) - return 0; - - __kfifo_poke_n(fifo, len, recsize); - - kfifo_copy_in(fifo, buf, len, fifo->in + recsize); - fifo->in += len + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_in_r); - -static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, - void *buf, unsigned int len, size_t recsize, unsigned int *n) -{ - *n = __kfifo_peek_n(fifo, recsize); - - if (len > *n) - len = *n; - - kfifo_copy_out(fifo, buf, len, fifo->out + recsize); - return len; -} - -unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - return kfifo_out_copy_r(fifo, buf, len, recsize, &n); -} -EXPORT_SYMBOL(__kfifo_out_peek_r); - -unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - 
- if (fifo->in == fifo->out) - return 0; - - len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); - fifo->out += n + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_out_r); - -void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int n; - - n = __kfifo_peek_n(fifo, recsize); - fifo->out += n + recsize; -} -EXPORT_SYMBOL(__kfifo_skip_r); - -int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) { - *copied = 0; - return 0; - } - - __kfifo_poke_n(fifo, len, recsize); - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->in += len + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_from_user_r); - -int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - unsigned int n; - - if (fifo->in == fifo->out) { - *copied = 0; - return 0; - } - - n = __kfifo_peek_n(fifo, recsize); - if (len > n) - len = n; - - ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->out += n + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_to_user_r); - -unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); - -void __kfifo_dma_in_finish_r(struct __kfifo *fifo, - unsigned int len, size_t recsize) -{ - len = __kfifo_max_r(len, recsize); - __kfifo_poke_n(fifo, len, recsize); - fifo->in += len + recsize; -} 
-EXPORT_SYMBOL(__kfifo_dma_in_finish_r); - -unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > fifo->in - fifo->out) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); - -void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int len; - - len = __kfifo_peek_n(fifo, recsize); - fifo->out += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_out_finish_r); -/* - kmod, the new module loader (replaces kerneld) - Kirk Petersen - - Reorganized not to be a daemon by Adam Richter, with guidance - from Greg Zornetzer. - - Modified to avoid chroot and file sharing problems. - Mikael Pettersson - - Limit the concurrent number of kmod modprobes to catch loops from - "modprobe needs a service that is in a module". - Keith Owens December 1999 - - Unblock all signals when we exec a usermode process. - Shuu Yamaguchi December 2000 - - call_usermodehelper wait flag, and remove exec_usermodehelper. - Rusty Russell Jan 2003 -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern int max_threads; - -static struct workqueue_struct *khelper_wq; - -#define CAP_BSET (void *)1 -#define CAP_PI (void *)2 - -static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; -static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; -static DEFINE_SPINLOCK(umh_sysctl_lock); -static DECLARE_RWSEM(umhelper_sem); - -#ifdef CONFIG_MODULES - -/* - modprobe_path is set via /proc/sys. 
-*/ -char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; - -/** - * __request_module - try to load a kernel module - * @wait: wait (or not) for the operation to complete - * @fmt: printf style format string for the name of the module - * @...: arguments as specified in the format string - * - * Load a module using the user mode module loader. The function returns - * zero on success or a negative errno code on failure. Note that a - * successful module load does not mean the module did not then unload - * and exit on an error of its own. Callers must check that the service - * they requested is now available not blindly invoke it. - * - * If module auto-loading support is disabled then this function - * becomes a no-operation. - */ -int __request_module(bool wait, const char *fmt, ...) -{ - va_list args; - char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; - int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ - static int kmod_loop_msg; - - va_start(args, fmt); - ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); - va_end(args); - if (ret >= MODULE_NAME_LEN) - return -ENAMETOOLONG; - - ret = security_kernel_module_request(module_name); - if (ret) - return ret; - - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method - * would be to run the parents of this process, counting how many times - * kmod was invoked. That would mean accessing the internals of the - * process tables to get the command line, proc_pid_cmdline is static - * and it is not worth changing the proc code just to handle this case. - * KAO. 
- * - * "trace the ppid" is simple, but will fail if someone's - * parent exits. I think this is as good as it gets. --RR - */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; - } - atomic_dec(&kmod_concurrent); - return -ENOMEM; - } - - trace_module_request(module_name, wait, _RET_IP_); - - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); - - atomic_dec(&kmod_concurrent); - return ret; -} -EXPORT_SYMBOL(__request_module); -#endif /* CONFIG_MODULES */ - -/* - * This is the task which runs the usermode application - */ -static int ____call_usermodehelper(void *data) -{ - struct subprocess_info *sub_info = data; - struct cred *new; - int retval; - - spin_lock_irq(¤t->sighand->siglock); - flush_signal_handlers(current, 1); - spin_unlock_irq(¤t->sighand->siglock); - - /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed_ptr(current, cpu_all_mask); - - /* - * Our parent is keventd, which runs with elevated scheduling priority. - * Avoid propagating that into the userspace child. - */ - set_user_nice(current, 0); - - retval = -ENOMEM; - new = prepare_kernel_cred(current); - if (!new) - goto fail; - - spin_lock(&umh_sysctl_lock); - new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); - new->cap_inheritable = cap_intersect(usermodehelper_inheritable, - new->cap_inheritable); - spin_unlock(&umh_sysctl_lock); - - if (sub_info->init) { - retval = sub_info->init(sub_info, new); - if (retval) { - abort_creds(new); - goto fail; - } - } - - commit_creds(new); - - retval = kernel_execve(sub_info->path, - (const char *const *)sub_info->argv, - (const char *const *)sub_info->envp); - - /* Exec failed? 
*/ -fail: - sub_info->retval = retval; - do_exit(0); -} - -void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} -EXPORT_SYMBOL(call_usermodehelper_freeinfo); - -/* Keventd can't block, but this (a child) can. */ -static int wait_for_helper(void *data) -{ - struct subprocess_info *sub_info = data; - pid_t pid; - - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; - spin_unlock_irq(¤t->sighand->siglock); - - pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But wait_for_helper() always runs as keventd, and put_user() - * to a kernel address works OK for kernel threads, due to their - * having an mm_segment_t which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either ____call_usermodehelper failed and the - * real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } - - complete(sub_info->complete); - return 0; -} - -/* This is run by khelper thread */ -static void __call_usermodehelper(struct work_struct *work) -{ - struct subprocess_info *sub_info = - container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; - pid_t pid; - - /* CLONE_VFORK: wait until the usermode helper has execve'd - * successfully We need the data structures to stay around - * until that is done. 
*/ - if (wait == UMH_WAIT_PROC) - pid = kernel_thread(wait_for_helper, sub_info, - CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, - CLONE_VFORK | SIGCHLD); - - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; - complete(sub_info->complete); - } -} - -/* - * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY - * (used for preventing user land processes from being created after the user - * land has been frozen during a system-wide hibernation or suspend operation). - * Should always be manipulated under umhelper_sem acquired for write. - */ -static int usermodehelper_disabled = 1; - -/* Number of helpers running */ -static atomic_t running_helpers = ATOMIC_INIT(0); - -/* - * Wait queue head used by usermodehelper_disable() to wait for all running - * helpers to finish. - */ -static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); - -/* - * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_disable() fails - */ -#define RUNNING_HELPERS_TIMEOUT (5 * HZ) - -void read_lock_usermodehelper(void) -{ - down_read(&umhelper_sem); -} -EXPORT_SYMBOL_GPL(read_lock_usermodehelper); - -void read_unlock_usermodehelper(void) -{ - up_read(&umhelper_sem); -} -EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); - -/** - * usermodehelper_disable - prevent new helpers from being started - */ -int usermodehelper_disable(void) -{ - long retval; - - down_write(&umhelper_sem); - usermodehelper_disabled = 1; - up_write(&umhelper_sem); - - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). 
- */ - retval = wait_event_timeout(running_helpers_waitq, - atomic_read(&running_helpers) == 0, - RUNNING_HELPERS_TIMEOUT); - if (retval) - return 0; - - down_write(&umhelper_sem); - usermodehelper_disabled = 0; - up_write(&umhelper_sem); - return -EAGAIN; -} - -/** - * usermodehelper_enable - allow new helpers to be started again - */ -void usermodehelper_enable(void) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = 0; - up_write(&umhelper_sem); -} - -/** - * usermodehelper_is_disabled - check if new helpers are allowed to be started - */ -bool usermodehelper_is_disabled(void) -{ - return usermodehelper_disabled; -} -EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); - -static void helper_lock(void) -{ - atomic_inc(&running_helpers); - smp_mb__after_atomic_inc(); -} - -static void helper_unlock(void) -{ - if (atomic_dec_and_test(&running_helpers)) - wake_up(&running_helpers_waitq); -} - -/** - * call_usermodehelper_setup - prepare to call a usermode helper - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @gfp_mask: gfp mask for memory allocation - * - * Returns either %NULL on allocation failure, or a subprocess_info - * structure. This should be passed to call_usermodehelper_exec to - * exec the process and free the structure. 
- */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask) -{ - struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); - if (!sub_info) - goto out; - - INIT_WORK(&sub_info->work, __call_usermodehelper); - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - out: - return sub_info; -} -EXPORT_SYMBOL(call_usermodehelper_setup); - -/** - * call_usermodehelper_setfns - set a cleanup/init function - * @info: a subprocess_info returned by call_usermodehelper_setup - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - info->cleanup = cleanup; - info->init = init; - info->data = data; -} -EXPORT_SYMBOL(call_usermodehelper_setfns); - -/** - * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa - * @wait: wait for the application to finish and return status. - * when -1 don't wait at all, but you get no useful error back when - * the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of keventd. - * (ie. it runs with full root capabilities). 
- */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) -{ - DECLARE_COMPLETION_ONSTACK(done); - int retval = 0; - - helper_lock(); - if (sub_info->path[0] == '\0') - goto out; - - if (!khelper_wq || usermodehelper_disabled) { - retval = -EBUSY; - goto out; - } - - sub_info->complete = &done; - sub_info->wait = wait; - - queue_work(khelper_wq, &sub_info->work); - if (wait == UMH_NO_WAIT) /* task has freed sub_info */ - goto unlock; - wait_for_completion(&done); - retval = sub_info->retval; - -out: - call_usermodehelper_freeinfo(sub_info); -unlock: - helper_unlock(); - return retval; -} -EXPORT_SYMBOL(call_usermodehelper_exec); - -static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; - kernel_cap_t new_cap; - int err, i; - - if (write && (!capable(CAP_SETPCAP) || - !capable(CAP_SYS_MODULE))) - return -EPERM; - - /* - * convert from the global kernel_cap_t to the ulong array to print to - * userspace if this is a read. - */ - spin_lock(&umh_sysctl_lock); - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { - if (table->data == CAP_BSET) - cap_array[i] = usermodehelper_bset.cap[i]; - else if (table->data == CAP_PI) - cap_array[i] = usermodehelper_inheritable.cap[i]; - else - BUG(); - } - spin_unlock(&umh_sysctl_lock); - - t = *table; - t.data = &cap_array; - - /* - * actually read or write and array of ulongs from userspace. 
Remember - * these are least significant 32 bits first - */ - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - /* - * convert from the sysctl array of ulongs to the kernel_cap_t - * internal representation - */ - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) - new_cap.cap[i] = cap_array[i]; - - /* - * Drop everything not in the new_cap (but don't add things) - */ - spin_lock(&umh_sysctl_lock); - if (write) { - if (table->data == CAP_BSET) - usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); - if (table->data == CAP_PI) - usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); - } - spin_unlock(&umh_sysctl_lock); - - return 0; -} - -struct ctl_table usermodehelper_table[] = { - { - .procname = "bset", - .data = CAP_BSET, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { - .procname = "inheritable", - .data = CAP_PI, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { } -}; - -void __init usermodehelper_init(void) -{ - khelper_wq = create_singlethread_workqueue("khelper"); - BUG_ON(!khelper_wq); -} -/* - * Kernel Probes (KProbes) - * kernel/kprobes.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S Kernel - * Probes initial implementation (includes suggestions from - * Rusty Russell). - * 2004-Aug Updated by Prasanna S Panchamukhi with - * hlists and exceptions notifier as suggested by Andi Kleen. - * 2004-July Suparna Bhattacharya added jumper probes - * interface to access function arguments. - * 2004-Sep Prasanna S Panchamukhi Changed Kprobes - * exceptions notifier to be first on the priority list. - * 2005-May Hien Nguyen , Jim Keniston - * and Prasanna S Panchamukhi - * added function-return probes. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define KPROBE_HASH_BITS 6 -#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) - - -/* - * Some oddball architectures like 64bit powerpc have function descriptors - * so this must be overridable. 
- */ -#ifndef kprobe_lookup_name -#define kprobe_lookup_name(name, addr) \ - addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) -#endif - -static int kprobes_initialized; -static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; - -/* NOTE: change this value only with kprobe_mutex held */ -static bool kprobes_all_disarmed; - -/* This protects kprobe_table and optimizing_list */ -static DEFINE_MUTEX(kprobe_mutex); -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; -static struct { - raw_spinlock_t lock ____cacheline_aligned_in_smp; -} kretprobe_table_locks[KPROBE_TABLE_SIZE]; - -static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -{ - return &(kretprobe_table_locks[hash].lock); -} - -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {"native_get_debugreg",}, - {"irq_entries_start",}, - {"common_interrupt",}, - {"mcount",}, /* mcount can be called from everywhere */ - {NULL} /* Terminator */ -}; - -#ifdef __ARCH_WANT_KPROBES_INSN_SLOT -/* - * kprobe->ainsn.insn points to the copy of the instruction to be - * single-stepped. 
x86_64, POWER4 and above have no-exec support and - * stepping on the instruction on a vmalloced/kmalloced/data page - * is a recipe for disaster - */ -struct kprobe_insn_page { - struct list_head list; - kprobe_opcode_t *insns; /* Page of instruction slots */ - int nused; - int ngarbage; - char slot_used[]; -}; - -#define KPROBE_INSN_PAGE_SIZE(slots) \ - (offsetof(struct kprobe_insn_page, slot_used) + \ - (sizeof(char) * (slots))) - -struct kprobe_insn_cache { - struct list_head pages; /* list of kprobe_insn_page */ - size_t insn_size; /* size of instruction slot */ - int nr_garbage; -}; - -static int slots_per_page(struct kprobe_insn_cache *c) -{ - return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); -} - -enum kprobe_slot_state { - SLOT_CLEAN = 0, - SLOT_DIRTY = 1, - SLOT_USED = 2, -}; - -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ -static struct kprobe_insn_cache kprobe_insn_slots = { - .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), - .insn_size = MAX_INSN_SIZE, - .nr_garbage = 0, -}; -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); - -/** - * __get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. - */ -static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) -{ - struct kprobe_insn_page *kip; - - retry: - list_for_each_entry(kip, &c->pages, list) { - if (kip->nused < slots_per_page(c)) { - int i; - for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_CLEAN) { - kip->slot_used[i] = SLOT_USED; - kip->nused++; - return kip->insns + (i * c->insn_size); - } - } - /* kip->nused is broken. Fix it. */ - kip->nused = slots_per_page(c); - WARN_ON(1); - } - } - - /* If there are any garbage slots, collect it and try again. */ - if (c->nr_garbage && collect_garbage_slots(c) == 0) - goto retry; - - /* All out of space. Need to allocate a new page. 
*/ - kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); - if (!kip) - return NULL; - - /* - * Use module_alloc so this page is within +/- 2GB of where the - * kernel image and loaded module images reside. This is required - * so x86_64 can correctly handle the %rip-relative fixups. - */ - kip->insns = module_alloc(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; - } - INIT_LIST_HEAD(&kip->list); - memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); - kip->slot_used[0] = SLOT_USED; - kip->nused = 1; - kip->ngarbage = 0; - list_add(&kip->list, &c->pages); - return kip->insns; -} - - -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(&kprobe_insn_slots); - mutex_unlock(&kprobe_insn_mutex); - - return ret; -} - -/* Return 1 if all garbages are collected, otherwise 0. */ -static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) -{ - kip->slot_used[idx] = SLOT_CLEAN; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. 
- */ - if (!list_is_singular(&kip->list)) { - list_del(&kip->list); - module_free(NULL, kip->insns); - kfree(kip); - } - return 1; - } - return 0; -} - -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) -{ - struct kprobe_insn_page *kip, *next; - - /* Ensure no-one is interrupted on the garbages */ - synchronize_sched(); - - list_for_each_entry_safe(kip, next, &c->pages, list) { - int i; - if (kip->ngarbage == 0) - continue; - kip->ngarbage = 0; /* we will collect all garbages */ - for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_DIRTY && - collect_one_slot(kip, i)) - break; - } - } - c->nr_garbage = 0; - return 0; -} - -static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) -{ - struct kprobe_insn_page *kip; - - list_for_each_entry(kip, &c->pages, list) { - long idx = ((long)slot - (long)kip->insns) / - (c->insn_size * sizeof(kprobe_opcode_t)); - if (idx >= 0 && idx < slots_per_page(c)) { - WARN_ON(kip->slot_used[idx] != SLOT_USED); - if (dirty) { - kip->slot_used[idx] = SLOT_DIRTY; - kip->ngarbage++; - if (++c->nr_garbage > slots_per_page(c)) - collect_garbage_slots(c); - } else - collect_one_slot(kip, idx); - return; - } - } - /* Could not free this slot. 
*/ - WARN_ON(1); -} - -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_insn_mutex); - __free_insn_slot(&kprobe_insn_slots, slot, dirty); - mutex_unlock(&kprobe_insn_mutex); -} -#ifdef CONFIG_OPTPROBES -/* For optimized_kprobe buffer */ -static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ -static struct kprobe_insn_cache kprobe_optinsn_slots = { - .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), - /* .insn_size is initialized later */ - .nr_garbage = 0, -}; -/* Get a slot for optimized_kprobe buffer */ -kprobe_opcode_t __kprobes *get_optinsn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_optinsn_mutex); - ret = __get_insn_slot(&kprobe_optinsn_slots); - mutex_unlock(&kprobe_optinsn_mutex); - - return ret; -} - -void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_optinsn_mutex); - __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); - mutex_unlock(&kprobe_optinsn_mutex); -} -#endif -#endif - -/* We have preemption disabled.. 
so it is safe to use __ versions */ -static inline void set_kprobe_instance(struct kprobe *kp) -{ - __this_cpu_write(kprobe_instance, kp); -} - -static inline void reset_kprobe_instance(void) -{ - __this_cpu_write(kprobe_instance, NULL); -} - -/* - * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c - */ -struct kprobe __kprobes *get_kprobe(void *addr) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - - head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, node, head, hlist) { - if (p->addr == addr) - return p; - } - - return NULL; -} - -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); - -/* Return true if the kprobe is an aggregator */ -static inline int kprobe_aggrprobe(struct kprobe *p) -{ - return p->pre_handler == aggr_pre_handler; -} - -/* Return true(!0) if the kprobe is unused */ -static inline int kprobe_unused(struct kprobe *p) -{ - return kprobe_aggrprobe(p) && kprobe_disabled(p) && - list_empty(&p->list); -} - -/* - * Keep all fields in the kprobe consistent - */ -static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) -{ - memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); -} - -#ifdef CONFIG_OPTPROBES -/* NOTE: change this value only with kprobe_mutex held */ -static bool kprobes_allow_optimization; - -/* - * Call all pre_handler on the list, but ignores its return value. - * This must be called from arch-dep optimized caller. 
- */ -void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler && likely(!kprobe_disabled(kp))) { - set_kprobe_instance(kp); - kp->pre_handler(kp, regs); - } - reset_kprobe_instance(); - } -} - -/* Free optimized instructions and optimized_kprobe */ -static __kprobes void free_aggr_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - op = container_of(p, struct optimized_kprobe, kp); - arch_remove_optimized_kprobe(op); - arch_remove_kprobe(p); - kfree(op); -} - -/* Return true(!0) if the kprobe is ready for optimization. */ -static inline int kprobe_optready(struct kprobe *p) -{ - struct optimized_kprobe *op; - - if (kprobe_aggrprobe(p)) { - op = container_of(p, struct optimized_kprobe, kp); - return arch_prepared_optinsn(&op->optinsn); - } - - return 0; -} - -/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ -static inline int kprobe_disarmed(struct kprobe *p) -{ - struct optimized_kprobe *op; - - /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ - if (!kprobe_aggrprobe(p)) - return kprobe_disabled(p); - - op = container_of(p, struct optimized_kprobe, kp); - - return kprobe_disabled(p) && list_empty(&op->list); -} - -/* Return true(!0) if the probe is queued on (un)optimizing lists */ -static int __kprobes kprobe_queued(struct kprobe *p) -{ - struct optimized_kprobe *op; - - if (kprobe_aggrprobe(p)) { - op = container_of(p, struct optimized_kprobe, kp); - if (!list_empty(&op->list)) - return 1; - } - return 0; -} - -/* - * Return an optimized kprobe whose optimizing code replaces - * instructions including addr (exclude breakpoint). - */ -static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) -{ - int i; - struct kprobe *p = NULL; - struct optimized_kprobe *op; - - /* Don't check i == 0, since that is a breakpoint case. 
*/ - for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) - p = get_kprobe((void *)(addr - i)); - - if (p && kprobe_optready(p)) { - op = container_of(p, struct optimized_kprobe, kp); - if (arch_within_optimized_kprobe(op, addr)) - return p; - } - - return NULL; -} - -/* Optimization staging list, protected by kprobe_mutex */ -static LIST_HEAD(optimizing_list); -static LIST_HEAD(unoptimizing_list); - -static void kprobe_optimizer(struct work_struct *work); -static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); -static DECLARE_COMPLETION(optimizer_comp); -#define OPTIMIZE_DELAY 5 - -/* - * Optimize (replace a breakpoint with a jump) kprobes listed on - * optimizing_list. - */ -static __kprobes void do_optimize_kprobes(void) -{ - /* Optimization never be done when disarmed */ - if (kprobes_all_disarmed || !kprobes_allow_optimization || - list_empty(&optimizing_list)) - return; - - /* - * The optimization/unoptimization refers online_cpus via - * stop_machine() and cpu-hotplug modifies online_cpus. - * And same time, text_mutex will be held in cpu-hotplug and here. - * This combination can cause a deadlock (cpu-hotplug try to lock - * text_mutex but stop_machine can not be done because online_cpus - * has been changed) - * To avoid this deadlock, we need to call get_online_cpus() - * for preventing cpu-hotplug outside of text_mutex locking. - */ - get_online_cpus(); - mutex_lock(&text_mutex); - arch_optimize_kprobes(&optimizing_list); - mutex_unlock(&text_mutex); - put_online_cpus(); -} - -/* - * Unoptimize (replace a jump with a breakpoint and remove the breakpoint - * if need) kprobes listed on unoptimizing_list. 
- */ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) -{ - struct optimized_kprobe *op, *tmp; - - /* Unoptimization must be done anytime */ - if (list_empty(&unoptimizing_list)) - return; - - /* Ditto to do_optimize_kprobes */ - get_online_cpus(); - mutex_lock(&text_mutex); - arch_unoptimize_kprobes(&unoptimizing_list, free_list); - /* Loop free_list for disarming */ - list_for_each_entry_safe(op, tmp, free_list, list) { - /* Disarm probes if marked disabled */ - if (kprobe_disabled(&op->kp)) - arch_disarm_kprobe(&op->kp); - if (kprobe_unused(&op->kp)) { - /* - * Remove unused probes from hash list. After waiting - * for synchronization, these probes are reclaimed. - * (reclaiming is done by do_free_cleaned_kprobes.) - */ - hlist_del_rcu(&op->kp.hlist); - } else - list_del_init(&op->list); - } - mutex_unlock(&text_mutex); - put_online_cpus(); -} - -/* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) -{ - struct optimized_kprobe *op, *tmp; - - list_for_each_entry_safe(op, tmp, free_list, list) { - BUG_ON(!kprobe_unused(&op->kp)); - list_del_init(&op->list); - free_aggr_kprobe(&op->kp); - } -} - -/* Start optimizer after OPTIMIZE_DELAY passed */ -static __kprobes void kick_kprobe_optimizer(void) -{ - if (!delayed_work_pending(&optimizing_work)) - schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); -} - -/* Kprobe jump optimizer */ -static __kprobes void kprobe_optimizer(struct work_struct *work) -{ - LIST_HEAD(free_list); - - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); - - /* - * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) - * kprobes before waiting for quiesence period. - */ - do_unoptimize_kprobes(&free_list); - - /* - * Step 2: Wait for quiesence period to ensure all running interrupts - * are done. 
Because optprobe may modify multiple instructions - * there is a chance that Nth instruction is interrupted. In that - * case, running interrupt can return to 2nd-Nth byte of jump - * instruction. This wait is for avoiding it. - */ - synchronize_sched(); - - /* Step 3: Optimize kprobes after quiesence period */ - do_optimize_kprobes(); - - /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(&free_list); - - mutex_unlock(&kprobe_mutex); - mutex_unlock(&module_mutex); - - /* Step 5: Kick optimizer again if needed */ - if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) - kick_kprobe_optimizer(); - else - /* Wake up all waiters */ - complete_all(&optimizer_comp); -} - -/* Wait for completing optimization and unoptimization */ -static __kprobes void wait_for_kprobe_optimizer(void) -{ - if (delayed_work_pending(&optimizing_work)) - wait_for_completion(&optimizer_comp); -} - -/* Optimize kprobe if p is ready to be optimized */ -static __kprobes void optimize_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - /* Check if the kprobe is disabled or not ready for optimization. */ - if (!kprobe_optready(p) || !kprobes_allow_optimization || - (kprobe_disabled(p) || kprobes_all_disarmed)) - return; - - /* Both of break_handler and post_handler are not supported. */ - if (p->break_handler || p->post_handler) - return; - - op = container_of(p, struct optimized_kprobe, kp); - - /* Check there is no other kprobes at the optimized instructions */ - if (arch_check_optimized_kprobe(op) < 0) - return; - - /* Check if it is already optimized. */ - if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) - return; - op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - - if (!list_empty(&op->list)) - /* This is under unoptimizing. 
Just dequeue the probe */ - list_del_init(&op->list); - else { - list_add(&op->list, &optimizing_list); - kick_kprobe_optimizer(); - } -} - -/* Short cut to direct unoptimizing */ -static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) -{ - get_online_cpus(); - arch_unoptimize_kprobe(op); - put_online_cpus(); - if (kprobe_disabled(&op->kp)) - arch_disarm_kprobe(&op->kp); -} - -/* Unoptimize a kprobe if p is optimized */ -static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) -{ - struct optimized_kprobe *op; - - if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) - return; /* This is not an optprobe nor optimized */ - - op = container_of(p, struct optimized_kprobe, kp); - if (!kprobe_optimized(p)) { - /* Unoptimized or unoptimizing case */ - if (force && !list_empty(&op->list)) { - /* - * Only if this is unoptimizing kprobe and forced, - * forcibly unoptimize it. (No need to unoptimize - * unoptimized kprobe again :) - */ - list_del_init(&op->list); - force_unoptimize_kprobe(op); - } - return; - } - - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - if (!list_empty(&op->list)) { - /* Dequeue from the optimization queue */ - list_del_init(&op->list); - return; - } - /* Optimized kprobe case */ - if (force) - /* Forcibly update the code: this is a special case */ - force_unoptimize_kprobe(op); - else { - list_add(&op->list, &unoptimizing_list); - kick_kprobe_optimizer(); - } -} - -/* Cancel unoptimizing for reusing */ -static void reuse_unused_kprobe(struct kprobe *ap) -{ - struct optimized_kprobe *op; - - BUG_ON(!kprobe_unused(ap)); - /* - * Unused kprobe MUST be on the way of delayed unoptimizing (means - * there is still a relative jump) and disabled. 
- */ - op = container_of(ap, struct optimized_kprobe, kp); - if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); - /* Enable the probe again */ - ap->flags &= ~KPROBE_FLAG_DISABLED; - /* Optimize it again (remove from op->list) */ - BUG_ON(!kprobe_optready(ap)); - optimize_kprobe(ap); -} - -/* Remove optimized instructions */ -static void __kprobes kill_optimized_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - op = container_of(p, struct optimized_kprobe, kp); - if (!list_empty(&op->list)) - /* Dequeue from the (un)optimization queue */ - list_del_init(&op->list); - - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - /* Don't touch the code, because it is already freed. */ - arch_remove_optimized_kprobe(op); -} - -/* Try to prepare optimized instructions */ -static __kprobes void prepare_optimized_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); -} - -/* Allocate new optimized_kprobe and try to prepare optimized instructions */ -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); - if (!op) - return NULL; - - INIT_LIST_HEAD(&op->list); - op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); - - return &op->kp; -} - -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); - -/* - * Prepare an optimized_kprobe and optimize it - * NOTE: p must be a normal registered kprobe - */ -static __kprobes void try_to_optimize_kprobe(struct kprobe *p) -{ - struct kprobe *ap; - struct optimized_kprobe *op; - - ap = alloc_aggr_kprobe(p); - if (!ap) - return; - - op = container_of(ap, struct optimized_kprobe, kp); - if (!arch_prepared_optinsn(&op->optinsn)) { - /* If failed to setup optimizing, fallback to kprobe */ - arch_remove_optimized_kprobe(op); - kfree(op); - 
return; - } - - init_aggr_kprobe(ap, p); - optimize_kprobe(ap); -} - -#ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */ -static void __kprobes optimize_all_kprobes(void) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - - /* If optimization is already allowed, just return */ - if (kprobes_allow_optimization) - return; - - kprobes_allow_optimization = true; - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) - if (!kprobe_disabled(p)) - optimize_kprobe(p); - } - printk(KERN_INFO "Kprobes globally optimized\n"); -} - -/* This should be called with kprobe_mutex locked */ -static void __kprobes unoptimize_all_kprobes(void) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - - /* If optimization is already prohibited, just return */ - if (!kprobes_allow_optimization) - return; - - kprobes_allow_optimization = false; - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!kprobe_disabled(p)) - unoptimize_kprobe(p, false); - } - } - /* Wait for unoptimizing completion */ - wait_for_kprobe_optimizer(); - printk(KERN_INFO "Kprobes globally unoptimized\n"); -} - -int sysctl_kprobes_optimization; -int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) -{ - int ret; - - mutex_lock(&kprobe_mutex); - sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - - if (sysctl_kprobes_optimization) - optimize_all_kprobes(); - else - unoptimize_all_kprobes(); - mutex_unlock(&kprobe_mutex); - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/* Put a breakpoint for a probe. 
Must be called with text_mutex locked */ -static void __kprobes __arm_kprobe(struct kprobe *p) -{ - struct kprobe *_p; - - /* Check collision with other optimized kprobes */ - _p = get_optimized_kprobe((unsigned long)p->addr); - if (unlikely(_p)) - /* Fallback to unoptimized kprobe */ - unoptimize_kprobe(_p, true); - - arch_arm_kprobe(p); - optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ -} - -/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ -static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) -{ - struct kprobe *_p; - - unoptimize_kprobe(p, false); /* Try to unoptimize */ - - if (!kprobe_queued(p)) { - arch_disarm_kprobe(p); - /* If another kprobe was blocked, optimize it. */ - _p = get_optimized_kprobe((unsigned long)p->addr); - if (unlikely(_p) && reopt) - optimize_kprobe(_p); - } - /* TODO: reoptimize others after unoptimized this probe */ -} - -#else /* !CONFIG_OPTPROBES */ - -#define optimize_kprobe(p) do {} while (0) -#define unoptimize_kprobe(p, f) do {} while (0) -#define kill_optimized_kprobe(p) do {} while (0) -#define prepare_optimized_kprobe(p) do {} while (0) -#define try_to_optimize_kprobe(p) do {} while (0) -#define __arm_kprobe(p) arch_arm_kprobe(p) -#define __disarm_kprobe(p, o) arch_disarm_kprobe(p) -#define kprobe_disarmed(p) kprobe_disabled(p) -#define wait_for_kprobe_optimizer() do {} while (0) - -/* There should be no unused kprobes can be reused without optimization */ -static void reuse_unused_kprobe(struct kprobe *ap) -{ - printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); - BUG_ON(kprobe_unused(ap)); -} - -static __kprobes void free_aggr_kprobe(struct kprobe *p) -{ - arch_remove_kprobe(p); - kfree(p); -} - -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) -{ - return kzalloc(sizeof(struct kprobe), GFP_KERNEL); -} -#endif /* CONFIG_OPTPROBES */ - -/* Arm a kprobe with text_mutex */ -static void __kprobes arm_kprobe(struct kprobe *kp) -{ - /* - 
* Here, since __arm_kprobe() doesn't use stop_machine(), - * this doesn't cause deadlock on text_mutex. So, we don't - * need get_online_cpus(). - */ - mutex_lock(&text_mutex); - __arm_kprobe(kp); - mutex_unlock(&text_mutex); -} - -/* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp) -{ - /* Ditto */ - mutex_lock(&text_mutex); - __disarm_kprobe(kp, true); - mutex_unlock(&text_mutex); -} - -/* - * Aggregate handlers for multiple kprobes support - these handlers - * take care of invoking the individual kprobe handlers on p->list - */ -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler && likely(!kprobe_disabled(kp))) { - set_kprobe_instance(kp); - if (kp->pre_handler(kp, regs)) - return 1; - } - reset_kprobe_instance(); - } - return 0; -} - -static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler && likely(!kprobe_disabled(kp))) { - set_kprobe_instance(kp); - kp->post_handler(kp, regs, flags); - reset_kprobe_instance(); - } - } -} - -static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) -{ - struct kprobe *cur = __this_cpu_read(kprobe_instance); - - /* - * if we faulted "during" the execution of a user specified - * probe handler, invoke just that probe's fault handler - */ - if (cur && cur->fault_handler) { - if (cur->fault_handler(cur, regs, trapnr)) - return 1; - } - return 0; -} - -static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *cur = __this_cpu_read(kprobe_instance); - int ret = 0; - - if (cur && cur->break_handler) { - if (cur->break_handler(cur, regs)) - ret = 1; - } - reset_kprobe_instance(); - return ret; -} - -/* Walks the list and increments nmissed count for 
multiprobe case */ -void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) -{ - struct kprobe *kp; - if (!kprobe_aggrprobe(p)) { - p->nmissed++; - } else { - list_for_each_entry_rcu(kp, &p->list, list) - kp->nmissed++; - } - return; -} - -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) -{ - struct kretprobe *rp = ri->rp; - - /* remove rp inst off the rprobe_inst_table */ - hlist_del(&ri->hlist); - INIT_HLIST_NODE(&ri->hlist); - if (likely(rp)) { - raw_spin_lock(&rp->lock); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock(&rp->lock); - } else - /* Unregistering */ - hlist_add_head(&ri->hlist, head); -} - -void __kprobes kretprobe_hash_lock(struct task_struct *tsk, - struct hlist_head **head, unsigned long *flags) -__acquires(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - *head = &kretprobe_inst_table[hash]; - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} - -static void __kprobes kretprobe_table_lock(unsigned long hash, - unsigned long *flags) -__acquires(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} - -void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) -__releases(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} - -static void __kprobes kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) -__releases(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} - -/* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe instances associated - * with this task. 
These left over instances represent probed functions - * that have been called but will never return. - */ -void __kprobes kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long hash, flags = 0; - - if (unlikely(!kprobes_initialized)) - /* Early boot. kretprobe_table_locks not yet initialized. */ - return; - - INIT_HLIST_HEAD(&empty_rp); - hash = hash_ptr(tk, KPROBE_HASH_BITS); - head = &kretprobe_inst_table[hash]; - kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri, &empty_rp); - } - kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } -} - -static inline void free_rp_inst(struct kretprobe *rp) -{ - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; - - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } -} - -static void __kprobes cleanup_rp_inst(struct kretprobe *rp) -{ - unsigned long flags, hash; - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; - struct hlist_head *head; - - /* No race here */ - for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { - kretprobe_table_lock(hash, &flags); - head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, pos, next, head, hlist) { - if (ri->rp == rp) - ri->rp = NULL; - } - kretprobe_table_unlock(hash, &flags); - } - free_rp_inst(rp); -} - -/* -* Add the new probe to ap->list. 
Fail if this is the -* second jprobe at the address - two jprobes can't coexist -*/ -static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) -{ - BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); - - if (p->break_handler || p->post_handler) - unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ - - if (p->break_handler) { - if (ap->break_handler) - return -EEXIST; - list_add_tail_rcu(&p->list, &ap->list); - ap->break_handler = aggr_break_handler; - } else - list_add_rcu(&p->list, &ap->list); - if (p->post_handler && !ap->post_handler) - ap->post_handler = aggr_post_handler; - - if (kprobe_disabled(ap) && !kprobe_disabled(p)) { - ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) - /* Arm the breakpoint again. */ - __arm_kprobe(ap); - } - return 0; -} - -/* - * Fill in the required fields of the "manager kprobe". Replace the - * earlier kprobe in the hlist with the manager kprobe - */ -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) -{ - /* Copy p's insn slot to ap */ - copy_kprobe(p, ap); - flush_insn_slot(ap); - ap->addr = p->addr; - ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; - ap->pre_handler = aggr_pre_handler; - ap->fault_handler = aggr_fault_handler; - /* We don't care the kprobe which has gone. */ - if (p->post_handler && !kprobe_gone(p)) - ap->post_handler = aggr_post_handler; - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = aggr_break_handler; - - INIT_LIST_HEAD(&ap->list); - INIT_HLIST_NODE(&ap->hlist); - - list_add_rcu(&p->list, &ap->list); - hlist_replace_rcu(&p->hlist, &ap->hlist); -} - -/* - * This is the second or subsequent kprobe at the address - handle - * the intricacies - */ -static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, - struct kprobe *p) -{ - int ret = 0; - struct kprobe *ap = orig_p; - - if (!kprobe_aggrprobe(orig_p)) { - /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. 
*/ - ap = alloc_aggr_kprobe(orig_p); - if (!ap) - return -ENOMEM; - init_aggr_kprobe(ap, orig_p); - } else if (kprobe_unused(ap)) - /* This probe is going to die. Rescue it */ - reuse_unused_kprobe(ap); - - if (kprobe_gone(ap)) { - /* - * Attempting to insert new probe at the same location that - * had a probe in the module vaddr area which already - * freed. So, the instruction slot has already been - * released. We need a new slot for the new probe. - */ - ret = arch_prepare_kprobe(ap); - if (ret) - /* - * Even if fail to allocate new slot, don't need to - * free aggr_probe. It will be used next time, or - * freed by unregister_kprobe. - */ - return ret; - - /* Prepare optimized instructions if possible. */ - prepare_optimized_kprobe(ap); - - /* - * Clear gone flag to prevent allocating new slot again, and - * set disabled flag because it is not armed yet. - */ - ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) - | KPROBE_FLAG_DISABLED; - } - - /* Copy ap's insn slot to p */ - copy_kprobe(ap, p); - return add_new_kprobe(ap, p); -} - -static int __kprobes in_kprobes_functions(unsigned long addr) -{ - struct kprobe_blackpoint *kb; - - if (addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) - return -EINVAL; - /* - * If there exists a kprobe_blacklist, verify and - * fail any probe registration in the prohibited area - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } - } - return 0; -} - -/* - * If we have a symbol_name argument, look it up and add the offset field - * to it. This way, we can specify a relative address to a symbol. - * This returns encoded errors if it fails to look up symbol or invalid - * combination of parameters. 
- */ -static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) -{ - kprobe_opcode_t *addr = p->addr; - - if ((p->symbol_name && p->addr) || - (!p->symbol_name && !p->addr)) - goto invalid; - - if (p->symbol_name) { - kprobe_lookup_name(p->symbol_name, addr); - if (!addr) - return ERR_PTR(-ENOENT); - } - - addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); - if (addr) - return addr; - -invalid: - return ERR_PTR(-EINVAL); -} - -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) -{ - struct kprobe *ap, *list_p; - - ap = get_kprobe(p->addr); - if (unlikely(!ap)) - return NULL; - - if (p != ap) { - list_for_each_entry_rcu(list_p, &ap->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return ap; -} - -/* Return error if the kprobe is being re-registered */ -static inline int check_kprobe_rereg(struct kprobe *p) -{ - int ret = 0; - - mutex_lock(&kprobe_mutex); - if (__get_valid_kprobe(p)) - ret = -EINVAL; - mutex_unlock(&kprobe_mutex); - - return ret; -} - -int __kprobes register_kprobe(struct kprobe *p) -{ - int ret = 0; - struct kprobe *old_p; - struct module *probed_mod; - kprobe_opcode_t *addr; - - addr = kprobe_addr(p); - if (IS_ERR(addr)) - return PTR_ERR(addr); - p->addr = addr; - - ret = check_kprobe_rereg(p); - if (ret) - return ret; - - jump_label_lock(); - preempt_disable(); - if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { - ret = -EINVAL; - goto cannot_probe; - } - - /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ - p->flags &= KPROBE_FLAG_DISABLED; - - /* - * Check if are we probing a module. - */ - probed_mod = __module_text_address((unsigned long) p->addr); - if (probed_mod) { - /* Return -ENOENT if fail. 
*/ - ret = -ENOENT; - /* - * We must hold a refcount of the probed module while updating - * its code to prohibit unexpected unloading. - */ - if (unlikely(!try_module_get(probed_mod))) - goto cannot_probe; - - /* - * If the module freed .init.text, we couldn't insert - * kprobes in there. - */ - if (within_module_init((unsigned long)p->addr, probed_mod) && - probed_mod->state != MODULE_STATE_COMING) { - module_put(probed_mod); - goto cannot_probe; - } - /* ret will be updated by following code */ - } - preempt_enable(); - jump_label_unlock(); - - p->nmissed = 0; - INIT_LIST_HEAD(&p->list); - mutex_lock(&kprobe_mutex); - - jump_label_lock(); /* needed to call jump_label_text_reserved() */ - - get_online_cpus(); /* For avoiding text_mutex deadlock. */ - mutex_lock(&text_mutex); - - old_p = get_kprobe(p->addr); - if (old_p) { - /* Since this may unoptimize old_p, locking text_mutex. */ - ret = register_aggr_kprobe(old_p, p); - goto out; - } - - ret = arch_prepare_kprobe(p); - if (ret) - goto out; - - INIT_HLIST_NODE(&p->hlist); - hlist_add_head_rcu(&p->hlist, - &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - - if (!kprobes_all_disarmed && !kprobe_disabled(p)) - __arm_kprobe(p); - - /* Try to optimize kprobe */ - try_to_optimize_kprobe(p); - -out: - mutex_unlock(&text_mutex); - put_online_cpus(); - jump_label_unlock(); - mutex_unlock(&kprobe_mutex); - - if (probed_mod) - module_put(probed_mod); - - return ret; - -cannot_probe: - preempt_enable(); - jump_label_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(register_kprobe); - -/* Check if all probes on the aggrprobe are disabled */ -static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &ap->list, list) - if (!kprobe_disabled(kp)) - /* - * There is an active probe on the list. - * We can't disable this ap. 
- */ - return 0; - - return 1; -} - -/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ -static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) -{ - struct kprobe *orig_p; - - /* Get an original kprobe for return */ - orig_p = __get_valid_kprobe(p); - if (unlikely(orig_p == NULL)) - return NULL; - - if (!kprobe_disabled(p)) { - /* Disable probe if it is a child probe */ - if (p != orig_p) - p->flags |= KPROBE_FLAG_DISABLED; - - /* Try to disarm and disable this/parent probe */ - if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p); - orig_p->flags |= KPROBE_FLAG_DISABLED; - } - } - - return orig_p; -} - -/* - * Unregister a kprobe without a scheduler synchronization. - */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) -{ - struct kprobe *ap, *list_p; - - /* Disable kprobe. This will disarm it if needed. */ - ap = __disable_kprobe(p); - if (ap == NULL) - return -EINVAL; - - if (ap == p) - /* - * This probe is an independent(and non-optimized) kprobe - * (not an aggrprobe). Remove from the hash list. - */ - goto disarmed; - - /* Following process expects this probe is an aggrprobe */ - WARN_ON(!kprobe_aggrprobe(ap)); - - if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) - /* - * !disarmed could be happen if the probe is under delayed - * unoptimizing. - */ - goto disarmed; - else { - /* If disabling probe has special handlers, update aggrprobe */ - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = NULL; - if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry_rcu(list_p, &ap->list, list) { - if ((list_p != p) && (list_p->post_handler)) - goto noclean; - } - ap->post_handler = NULL; - } -noclean: - /* - * Remove from the aggrprobe: this path will do nothing in - * __unregister_kprobe_bottom(). - */ - list_del_rcu(&p->list); - if (!kprobe_disabled(ap) && !kprobes_all_disarmed) - /* - * Try to optimize this probe again, because post - * handler may have been changed. 
- */ - optimize_kprobe(ap); - } - return 0; - -disarmed: - BUG_ON(!kprobe_disarmed(ap)); - hlist_del_rcu(&ap->hlist); - return 0; -} - -static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) -{ - struct kprobe *ap; - - if (list_empty(&p->list)) - /* This is an independent kprobe */ - arch_remove_kprobe(p); - else if (list_is_singular(&p->list)) { - /* This is the last child of an aggrprobe */ - ap = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - free_aggr_kprobe(ap); - } - /* Otherwise, do nothing. */ -} - -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - int i, ret = 0; - - if (num <= 0) - return -EINVAL; - for (i = 0; i < num; i++) { - ret = register_kprobe(kps[i]); - if (ret < 0) { - if (i > 0) - unregister_kprobes(kps, i); - break; - } - } - return ret; -} -EXPORT_SYMBOL_GPL(register_kprobes); - -void __kprobes unregister_kprobe(struct kprobe *p) -{ - unregister_kprobes(&p, 1); -} -EXPORT_SYMBOL_GPL(unregister_kprobe); - -void __kprobes unregister_kprobes(struct kprobe **kps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(kps[i]) < 0) - kps[i]->addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) - if (kps[i]->addr) - __unregister_kprobe_bottom(kps[i]); -} -EXPORT_SYMBOL_GPL(unregister_kprobes); - -static struct notifier_block kprobe_exceptions_nb = { - .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to be notified first */ -}; - -unsigned long __weak arch_deref_entry_point(void *entry) -{ - return (unsigned long)entry; -} - -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - struct jprobe *jp; - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - for (i = 0; i < num; i++) { - unsigned long addr, offset; - jp = jps[i]; - addr = arch_deref_entry_point(jp->entry); - - /* Verify probepoint is a function entry point */ - if 
(kallsyms_lookup_size_offset(addr, NULL, &offset) && - offset == 0) { - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = register_kprobe(&jp->kp); - } else - ret = -EINVAL; - - if (ret < 0) { - if (i > 0) - unregister_jprobes(jps, i); - break; - } - } - return ret; -} -EXPORT_SYMBOL_GPL(register_jprobes); - -int __kprobes register_jprobe(struct jprobe *jp) -{ - return register_jprobes(&jp, 1); -} -EXPORT_SYMBOL_GPL(register_jprobe); - -void __kprobes unregister_jprobe(struct jprobe *jp) -{ - unregister_jprobes(&jp, 1); -} -EXPORT_SYMBOL_GPL(unregister_jprobe); - -void __kprobes unregister_jprobes(struct jprobe **jps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&jps[i]->kp) < 0) - jps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (jps[i]->kp.addr) - __unregister_kprobe_bottom(&jps[i]->kp); - } -} -EXPORT_SYMBOL_GPL(unregister_jprobes); - -#ifdef CONFIG_KRETPROBES -/* - * This kprobe pre_handler is registered with every kretprobe. When probe - * hits it will set up the return probe. 
- */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) -{ - struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long hash, flags = 0; - struct kretprobe_instance *ri; - - /*TODO: consider to only swap the RA after the last pre_handler fired */ - hash = hash_ptr(current, KPROBE_HASH_BITS); - raw_spin_lock_irqsave(&rp->lock, flags); - if (!hlist_empty(&rp->free_instances)) { - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, hlist); - hlist_del(&ri->hlist); - raw_spin_unlock_irqrestore(&rp->lock, flags); - - ri->rp = rp; - ri->task = current; - - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock_irqrestore(&rp->lock, flags); - return 0; - } - - arch_prepare_kretprobe(ri, regs); - - /* XXX(hch): why is there no hlist_move_head? */ - INIT_HLIST_NODE(&ri->hlist); - kretprobe_table_lock(hash, &flags); - hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); - kretprobe_table_unlock(hash, &flags); - } else { - rp->nmissed++; - raw_spin_unlock_irqrestore(&rp->lock, flags); - } - return 0; -} - -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - int ret = 0; - struct kretprobe_instance *inst; - int i; - void *addr; - - if (kretprobe_blacklist_size) { - addr = kprobe_addr(&rp->kp); - if (IS_ERR(addr)) - return PTR_ERR(addr); - - for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - if (kretprobe_blacklist[i].addr == addr) - return -EINVAL; - } - } - - rp->kp.pre_handler = pre_handler_kretprobe; - rp->kp.post_handler = NULL; - rp->kp.fault_handler = NULL; - rp->kp.break_handler = NULL; - - /* Pre-allocate memory for max kretprobe instances */ - if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPT - rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); -#else - rp->maxactive = num_possible_cpus(); -#endif - } - raw_spin_lock_init(&rp->lock); - 
INIT_HLIST_HEAD(&rp->free_instances); - for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance) + - rp->data_size, GFP_KERNEL); - if (inst == NULL) { - free_rp_inst(rp); - return -ENOMEM; - } - INIT_HLIST_NODE(&inst->hlist); - hlist_add_head(&inst->hlist, &rp->free_instances); - } - - rp->nmissed = 0; - /* Establish function entry probe point */ - ret = register_kprobe(&rp->kp); - if (ret != 0) - free_rp_inst(rp); - return ret; -} -EXPORT_SYMBOL_GPL(register_kretprobe); - -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - for (i = 0; i < num; i++) { - ret = register_kretprobe(rps[i]); - if (ret < 0) { - if (i > 0) - unregister_kretprobes(rps, i); - break; - } - } - return ret; -} -EXPORT_SYMBOL_GPL(register_kretprobes); - -void __kprobes unregister_kretprobe(struct kretprobe *rp) -{ - unregister_kretprobes(&rp, 1); -} -EXPORT_SYMBOL_GPL(unregister_kretprobe); - -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&rps[i]->kp) < 0) - rps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (rps[i]->kp.addr) { - __unregister_kprobe_bottom(&rps[i]->kp); - cleanup_rp_inst(rps[i]); - } - } -} -EXPORT_SYMBOL_GPL(unregister_kretprobes); - -#else /* CONFIG_KRETPROBES */ -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return -ENOSYS; -} -EXPORT_SYMBOL_GPL(register_kretprobe); - -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return -ENOSYS; -} -EXPORT_SYMBOL_GPL(register_kretprobes); - -void __kprobes unregister_kretprobe(struct kretprobe *rp) -{ -} -EXPORT_SYMBOL_GPL(unregister_kretprobe); - -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) -{ -} -EXPORT_SYMBOL_GPL(unregister_kretprobes); - -static int 
__kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) -{ - return 0; -} - -#endif /* CONFIG_KRETPROBES */ - -/* Set the kprobe gone and remove its instruction buffer. */ -static void __kprobes kill_kprobe(struct kprobe *p) -{ - struct kprobe *kp; - - p->flags |= KPROBE_FLAG_GONE; - if (kprobe_aggrprobe(p)) { - /* - * If this is an aggr_kprobe, we have to list all the - * chained probes and mark them GONE. - */ - list_for_each_entry_rcu(kp, &p->list, list) - kp->flags |= KPROBE_FLAG_GONE; - p->post_handler = NULL; - p->break_handler = NULL; - kill_optimized_kprobe(p); - } - /* - * Here, we can remove insn_slot safely, because no thread calls - * the original probed function (which will be freed soon) any more. - */ - arch_remove_kprobe(p); -} - -/* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) -{ - int ret = 0; - - mutex_lock(&kprobe_mutex); - - /* Disable this kprobe */ - if (__disable_kprobe(kp) == NULL) - ret = -EINVAL; - - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(disable_kprobe); - -/* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - if (kprobe_gone(kp)) { - /* This kprobe has gone, we couldn't enable it. 
*/ - ret = -EINVAL; - goto out; - } - - if (p != kp) - kp->flags &= ~KPROBE_FLAG_DISABLED; - - if (!kprobes_all_disarmed && kprobe_disabled(p)) { - p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); - } -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(enable_kprobe); - -void __kprobes dump_kprobe(struct kprobe *kp) -{ - printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", - kp->symbol_name, kp->addr, kp->offset); -} - -/* Module notifier call back, checking kprobes on the module */ -static int __kprobes kprobes_module_callback(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct module *mod = data; - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - int checkcore = (val == MODULE_STATE_GOING); - - if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) - return NOTIFY_DONE; - - /* - * When MODULE_STATE_GOING was notified, both of module .text and - * .init.text sections would be freed. When MODULE_STATE_LIVE was - * notified, only .init.text section would be freed. We need to - * disable kprobes which have been inserted in the sections. - */ - mutex_lock(&kprobe_mutex); - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) - if (within_module_init((unsigned long)p->addr, mod) || - (checkcore && - within_module_core((unsigned long)p->addr, mod))) { - /* - * The vaddr this probe is installed will soon - * be vfreed buy not synced to disk. Hence, - * disarming the breakpoint isn't needed. 
- */ - kill_kprobe(p); - } - } - mutex_unlock(&kprobe_mutex); - return NOTIFY_DONE; -} - -static struct notifier_block kprobe_module_nb = { - .notifier_call = kprobes_module_callback, - .priority = 0 -}; - -static int __init init_kprobes(void) -{ - int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[128]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; - - /* FIXME allocate the probe table, currently defined statically */ - /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - INIT_HLIST_HEAD(&kprobe_table[i]); - INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); - } - - /* - * Lookup and populate the kprobe_blacklist. - * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; - } - - if (kretprobe_blacklist_size) { - /* lookup the function address from its name */ - for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - kprobe_lookup_name(kretprobe_blacklist[i].name, - kretprobe_blacklist[i].addr); - if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", - kretprobe_blacklist[i].name); - } - } - -#if defined(CONFIG_OPTPROBES) -#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) - /* Init kprobe_optinsn_slots */ - kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; -#endif - /* By default, kprobes can be optimized */ - kprobes_allow_optimization = true; -#endif - - /* By default, kprobes are armed */ - kprobes_all_disarmed = false; - - err = arch_init_kprobes(); - if (!err) - 
err = register_die_notifier(&kprobe_exceptions_nb); - if (!err) - err = register_module_notifier(&kprobe_module_nb); - - kprobes_initialized = (err == 0); - - if (!err) - init_test_probes(); - return err; -} - -#ifdef CONFIG_DEBUG_FS -static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset, char *modname, struct kprobe *pp) -{ - char *kprobe_type; - - if (p->pre_handler == pre_handler_kretprobe) - kprobe_type = "r"; - else if (p->pre_handler == setjmp_pre_handler) - kprobe_type = "j"; - else - kprobe_type = "k"; - - if (sym) - seq_printf(pi, "%p %s %s+0x%x %s ", - p->addr, kprobe_type, sym, offset, - (modname ? modname : " ")); - else - seq_printf(pi, "%p %s %p ", - p->addr, kprobe_type, p->addr); - - if (!pp) - pp = p; - seq_printf(pi, "%s%s%s\n", - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), - (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); -} - -static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) -{ - return (*pos < KPROBE_TABLE_SIZE) ? 
pos : NULL; -} - -static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) -{ - (*pos)++; - if (*pos >= KPROBE_TABLE_SIZE) - return NULL; - return pos; -} - -static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) -{ - /* Nothing to do */ -} - -static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p, *kp; - const char *sym = NULL; - unsigned int i = *(loff_t *) v; - unsigned long offset = 0; - char *modname, namebuf[128]; - - head = &kprobe_table[i]; - preempt_disable(); - hlist_for_each_entry_rcu(p, node, head, hlist) { - sym = kallsyms_lookup((unsigned long)p->addr, NULL, - &offset, &modname, namebuf); - if (kprobe_aggrprobe(p)) { - list_for_each_entry_rcu(kp, &p->list, list) - report_probe(pi, kp, sym, offset, modname, p); - } else - report_probe(pi, p, sym, offset, modname, NULL); - } - preempt_enable(); - return 0; -} - -static const struct seq_operations kprobes_seq_ops = { - .start = kprobe_seq_start, - .next = kprobe_seq_next, - .stop = kprobe_seq_stop, - .show = show_kprobe_addr -}; - -static int __kprobes kprobes_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobes_seq_ops); -} - -static const struct file_operations debugfs_kprobes_operations = { - .open = kprobes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void __kprobes arm_all_kprobes(void) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - - mutex_lock(&kprobe_mutex); - - /* If kprobes are armed, just return */ - if (!kprobes_all_disarmed) - goto already_enabled; - - /* Arming kprobes doesn't optimize kprobe itself */ - mutex_lock(&text_mutex); - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) - if (!kprobe_disabled(p)) - __arm_kprobe(p); - } - mutex_unlock(&text_mutex); - - kprobes_all_disarmed 
= false; - printk(KERN_INFO "Kprobes globally enabled\n"); - -already_enabled: - mutex_unlock(&kprobe_mutex); - return; -} - -static void __kprobes disarm_all_kprobes(void) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kprobe *p; - unsigned int i; - - mutex_lock(&kprobe_mutex); - - /* If kprobes are already disarmed, just return */ - if (kprobes_all_disarmed) { - mutex_unlock(&kprobe_mutex); - return; - } - - kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); - - mutex_lock(&text_mutex); - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { - head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p, false); - } - } - mutex_unlock(&text_mutex); - mutex_unlock(&kprobe_mutex); - - /* Wait for disarming all kprobes by optimizer */ - wait_for_kprobe_optimizer(); -} - -/* - * XXX: The debugfs bool file interface doesn't allow for callbacks - * when the bool state is switched. 
We can reuse that facility when - * available - */ -static ssize_t read_enabled_file_bool(struct file *file, - char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[3]; - - if (!kprobes_all_disarmed) - buf[0] = '1'; - else - buf[0] = '0'; - buf[1] = '\n'; - buf[2] = 0x00; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t write_enabled_file_bool(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - size_t buf_size; - - buf_size = min(count, (sizeof(buf)-1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - arm_all_kprobes(); - break; - case 'n': - case 'N': - case '0': - disarm_all_kprobes(); - break; - } - - return count; -} - -static const struct file_operations fops_kp = { - .read = read_enabled_file_bool, - .write = write_enabled_file_bool, - .llseek = default_llseek, -}; - -static int __kprobes debugfs_kprobe_init(void) -{ - struct dentry *dir, *file; - unsigned int value = 1; - - dir = debugfs_create_dir("kprobes", NULL); - if (!dir) - return -ENOMEM; - - file = debugfs_create_file("list", 0444, dir, NULL, - &debugfs_kprobes_operations); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } - - file = debugfs_create_file("enabled", 0600, dir, - &value, &fops_kp); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } - - return 0; -} - -late_initcall(debugfs_kprobe_init); -#endif /* CONFIG_DEBUG_FS */ - -module_init(init_kprobes); - -/* defined in arch/.../kernel/kprobes.c */ -EXPORT_SYMBOL_GPL(jprobe_return); -/* - * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which - * are not related to any other subsystem - * - * Copyright (C) 2004 Kay Sievers - * - * This file is release under the GPLv2 - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define KERNEL_ATTR_RO(_name) \ -static struct kobj_attribute _name##_attr 
= __ATTR_RO(_name) - -#define KERNEL_ATTR_RW(_name) \ -static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) - -#if defined(CONFIG_HOTPLUG) -/* current uevent sequence number */ -static ssize_t uevent_seqnum_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); -} -KERNEL_ATTR_RO(uevent_seqnum); - -/* uevent helper program, used during early boot */ -static ssize_t uevent_helper_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", uevent_helper); -} -static ssize_t uevent_helper_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - if (count+1 > UEVENT_HELPER_PATH_LEN) - return -ENOENT; - memcpy(uevent_helper, buf, count); - uevent_helper[count] = '\0'; - if (count && uevent_helper[count-1] == '\n') - uevent_helper[count-1] = '\0'; - return count; -} -KERNEL_ATTR_RW(uevent_helper); -#endif - -#ifdef CONFIG_PROFILING -static ssize_t profiling_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", prof_on); -} -static ssize_t profiling_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int ret; - - if (prof_on) - return -EEXIST; - /* - * This eventually calls into get_option() which - * has a ton of callers and is not const. It is - * easiest to cast it away here. 
- */ - profile_setup((char *)buf); - ret = profile_init(); - if (ret) - return ret; - ret = create_proc_profile(); - if (ret) - return ret; - return count; -} -KERNEL_ATTR_RW(profiling); -#endif - -#ifdef CONFIG_KEXEC -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", !!kexec_crash_image); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -static ssize_t kexec_crash_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%zu\n", crash_get_memory_size()); -} -static ssize_t kexec_crash_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long cnt; - int ret; - - if (strict_strtoul(buf, 0, &cnt)) - return -EINVAL; - - ret = crash_shrink_memory(cnt); - return ret < 0 ? ret : count; -} -KERNEL_ATTR_RW(kexec_crash_size); - -static ssize_t vmcoreinfo_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lx %x\n", - paddr_vmcoreinfo_note(), - (unsigned int)vmcoreinfo_max_size); -} -KERNEL_ATTR_RO(vmcoreinfo); - -#endif /* CONFIG_KEXEC */ - -/* whether file capabilities are enabled */ -static ssize_t fscaps_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", file_caps_enabled); -} -KERNEL_ATTR_RO(fscaps); - -/* - * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
- */ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); -#define notes_size (&__stop_notes - &__start_notes) - -static ssize_t notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) -{ - memcpy(buf, &__start_notes + off, count); - return count; -} - -static struct bin_attribute notes_attr = { - .attr = { - .name = "notes", - .mode = S_IRUGO, - }, - .read = ¬es_read, -}; - -struct kobject *kernel_kobj; -EXPORT_SYMBOL_GPL(kernel_kobj); - -static struct attribute * kernel_attrs[] = { - &fscaps_attr.attr, -#if defined(CONFIG_HOTPLUG) - &uevent_seqnum_attr.attr, - &uevent_helper_attr.attr, -#endif -#ifdef CONFIG_PROFILING - &profiling_attr.attr, -#endif -#ifdef CONFIG_KEXEC - &kexec_loaded_attr.attr, - &kexec_crash_loaded_attr.attr, - &kexec_crash_size_attr.attr, - &vmcoreinfo_attr.attr, -#endif - NULL -}; - -static struct attribute_group kernel_attr_group = { - .attrs = kernel_attrs, -}; - -static int __init ksysfs_init(void) -{ - int error; - - kernel_kobj = kobject_create_and_add("kernel", NULL); - if (!kernel_kobj) { - error = -ENOMEM; - goto exit; - } - error = sysfs_create_group(kernel_kobj, &kernel_attr_group); - if (error) - goto kset_exit; - - if (notes_size > 0) { - notes_attr.size = notes_size; - error = sysfs_create_bin_file(kernel_kobj, ¬es_attr); - if (error) - goto group_exit; - } - - return 0; - -group_exit: - sysfs_remove_group(kernel_kobj, &kernel_attr_group); -kset_exit: - kobject_put(kernel_kobj); -exit: - return error; -} - -core_initcall(ksysfs_init); -/* Kernel thread helper functions. - * Copyright (C) 2004 IBM Corporation, Rusty Russell. - * - * Creation is done via kthreadd, so that we get a clean environment - * even if we're invoked from userspace (think modprobe, hotplug cpu, - * etc.). 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(kthread_create_lock); -static LIST_HEAD(kthread_create_list); -struct task_struct *kthreadd_task; - -struct kthread_create_info -{ - /* Information passed to kthread() from kthreadd. */ - int (*threadfn)(void *data); - void *data; - int node; - - /* Result passed back to kthread_create() from kthreadd. */ - struct task_struct *result; - struct completion done; - - struct list_head list; -}; - -struct kthread { - int should_stop; - void *data; - struct completion exited; -}; - -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) - -/** - * kthread_should_stop - should this kthread return now? - * - * When someone calls kthread_stop() on your kthread, it will be woken - * and this will return true. You should then return, and your return - * value will be passed through to kthread_stop(). - */ -int kthread_should_stop(void) -{ - return to_kthread(current)->should_stop; -} -EXPORT_SYMBOL(kthread_should_stop); - -/** - * kthread_freezable_should_stop - should this freezable kthread return now? - * @was_frozen: optional out parameter, indicates whether %current was frozen - * - * kthread_should_stop() for freezable kthreads, which will enter - * refrigerator if necessary. This function is safe from kthread_stop() / - * freezer deadlock and freezable kthreads should use this function instead - * of calling try_to_freeze() directly. 
- */ -bool kthread_freezable_should_stop(bool *was_frozen) -{ - bool frozen = false; - - might_sleep(); - - if (unlikely(freezing(current))) - frozen = __refrigerator(true); - - if (was_frozen) - *was_frozen = frozen; - - return kthread_should_stop(); -} -EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); - -/** - * kthread_data - return data value specified on kthread creation - * @task: kthread task in question - * - * Return the data value specified when kthread @task was created. - * The caller is responsible for ensuring the validity of @task when - * calling this function. - */ -void *kthread_data(struct task_struct *task) -{ - return to_kthread(task)->data; -} - -static int kthread(void *_create) -{ - /* Copy data: it's on kthread's stack */ - struct kthread_create_info *create = _create; - int (*threadfn)(void *data) = create->threadfn; - void *data = create->data; - struct kthread self; - int ret; - - self.should_stop = 0; - self.data = data; - init_completion(&self.exited); - current->vfork_done = &self.exited; - - /* OK, tell user we're spawned, wait for stop or wakeup */ - __set_current_state(TASK_UNINTERRUPTIBLE); - create->result = current; - complete(&create->done); - schedule(); - - ret = -EINTR; - if (!self.should_stop) - ret = threadfn(data); - - /* we can't just return, we must preserve "self" on stack */ - do_exit(ret); -} - -/* called from do_fork() to get node information for about to be created task */ -int tsk_fork_get_node(struct task_struct *tsk) -{ -#ifdef CONFIG_NUMA - if (tsk == kthreadd_task) - return tsk->pref_node_fork; -#endif - return numa_node_id(); -} - -static void create_kthread(struct kthread_create_info *create) -{ - int pid; - -#ifdef CONFIG_NUMA - current->pref_node_fork = create->node; -#endif - /* We want our own signal handler (we take no signals by default). 
*/ - pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); - if (pid < 0) { - create->result = ERR_PTR(pid); - complete(&create->done); - } -} - -/** - * kthread_create_on_node - create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @node: memory node number. - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). - * - * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which no one will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM). - */ -struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, - int node, - const char namefmt[], - ...) -{ - struct kthread_create_info create; - - create.threadfn = threadfn; - create.data = data; - create.node = node; - init_completion(&create.done); - - spin_lock(&kthread_create_lock); - list_add_tail(&create.list, &kthread_create_list); - spin_unlock(&kthread_create_lock); - - wake_up_process(kthreadd_task); - wait_for_completion(&create.done); - - if (!IS_ERR(create.result)) { - static const struct sched_param param = { .sched_priority = 0 }; - va_list args; - - va_start(args, namefmt); - vsnprintf(create.result->comm, sizeof(create.result->comm), - namefmt, args); - va_end(args); - /* - * root may have changed our (kthreadd's) priority or CPU mask. 
- * The kernel thread should not inherit these properties. - */ - sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(create.result, cpu_all_mask); - } - return create.result; -} -EXPORT_SYMBOL(kthread_create_on_node); - -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @p: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - */ -void kthread_bind(struct task_struct *p, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - - /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** - * kthread_stop - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. This can also be called after kthread_create() - * instead of calling wake_up_process(): the thread will exit without - * calling threadfn(). - * - * If threadfn() may call do_exit() itself, the caller must ensure - * task_struct can't go away. - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. 
- */ -int kthread_stop(struct task_struct *k) -{ - struct kthread *kthread; - int ret; - - trace_sched_kthread_stop(k); - get_task_struct(k); - - kthread = to_kthread(k); - barrier(); /* it might have exited */ - if (k->vfork_done != NULL) { - kthread->should_stop = 1; - wake_up_process(k); - wait_for_completion(&kthread->exited); - } - ret = k->exit_code; - - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); - - return ret; -} -EXPORT_SYMBOL(kthread_stop); - -int kthreadd(void *unused) -{ - struct task_struct *tsk = current; - - /* Setup a clean context for our children to inherit. */ - set_task_comm(tsk, "kthreadd"); - ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_states[N_HIGH_MEMORY]); - - current->flags |= PF_NOFREEZE; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty(&kthread_create_list)) - schedule(); - __set_current_state(TASK_RUNNING); - - spin_lock(&kthread_create_lock); - while (!list_empty(&kthread_create_list)) { - struct kthread_create_info *create; - - create = list_entry(kthread_create_list.next, - struct kthread_create_info, list); - list_del_init(&create->list); - spin_unlock(&kthread_create_lock); - - create_kthread(create); - - spin_lock(&kthread_create_lock); - } - spin_unlock(&kthread_create_lock); - } - - return 0; -} - -void __init_kthread_worker(struct kthread_worker *worker, - const char *name, - struct lock_class_key *key) -{ - spin_lock_init(&worker->lock); - lockdep_set_class_and_name(&worker->lock, key, name); - INIT_LIST_HEAD(&worker->work_list); - worker->task = NULL; -} -EXPORT_SYMBOL_GPL(__init_kthread_worker); - -/** - * kthread_worker_fn - kthread function to process kthread_worker - * @worker_ptr: pointer to initialized kthread_worker - * - * This function can be used as @threadfn to kthread_create() or - * kthread_run() with @worker_ptr argument pointing to an initialized - * kthread_worker. 
The started kthread will process work_list until - * the it is stopped with kthread_stop(). A kthread can also call - * this function directly after extra initialization. - * - * Different kthreads can be used for the same kthread_worker as long - * as there's only one kthread attached to it at any given time. A - * kthread_worker without an attached kthread simply collects queued - * kthread_works. - */ -int kthread_worker_fn(void *worker_ptr) -{ - struct kthread_worker *worker = worker_ptr; - struct kthread_work *work; - - WARN_ON(worker->task); - worker->task = current; -repeat: - set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - spin_lock_irq(&worker->lock); - worker->task = NULL; - spin_unlock_irq(&worker->lock); - return 0; - } - - work = NULL; - spin_lock_irq(&worker->lock); - if (!list_empty(&worker->work_list)) { - work = list_first_entry(&worker->work_list, - struct kthread_work, node); - list_del_init(&work->node); - } - spin_unlock_irq(&worker->lock); - - if (work) { - __set_current_state(TASK_RUNNING); - work->func(work); - smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ - work->done_seq = work->queue_seq; - smp_mb(); /* mb worker-b1 paired with flush-b0 */ - if (atomic_read(&work->flushing)) - wake_up_all(&work->done); - } else if (!freezing(current)) - schedule(); - - try_to_freeze(); - goto repeat; -} -EXPORT_SYMBOL_GPL(kthread_worker_fn); - -/** - * queue_kthread_work - queue a kthread_work - * @worker: target kthread_worker - * @work: kthread_work to queue - * - * Queue @work to work processor @task for async execution. @task - * must have been created with kthread_worker_create(). Returns %true - * if @work was successfully queued, %false if it was already pending. 
- */ -bool queue_kthread_work(struct kthread_worker *worker, - struct kthread_work *work) -{ - bool ret = false; - unsigned long flags; - - spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { - list_add_tail(&work->node, &worker->work_list); - work->queue_seq++; - if (likely(worker->task)) - wake_up_process(worker->task); - ret = true; - } - spin_unlock_irqrestore(&worker->lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(queue_kthread_work); - -/** - * flush_kthread_work - flush a kthread_work - * @work: work to flush - * - * If @work is queued or executing, wait for it to finish execution. - */ -void flush_kthread_work(struct kthread_work *work) -{ - int seq = work->queue_seq; - - atomic_inc(&work->flushing); - - /* - * mb flush-b0 paired with worker-b1, to make sure either - * worker sees the above increment or we see done_seq update. - */ - smp_mb__after_atomic_inc(); - - /* A - B <= 0 tests whether B is in front of A regardless of overflow */ - wait_event(work->done, seq - work->done_seq <= 0); - atomic_dec(&work->flushing); - - /* - * rmb flush-b1 paired with worker-b0, to make sure our caller - * sees every change made by work->func(). - */ - smp_mb__after_atomic_dec(); -} -EXPORT_SYMBOL_GPL(flush_kthread_work); - -struct kthread_flush_work { - struct kthread_work work; - struct completion done; -}; - -static void kthread_flush_work_fn(struct kthread_work *work) -{ - struct kthread_flush_work *fwork = - container_of(work, struct kthread_flush_work, work); - complete(&fwork->done); -} - -/** - * flush_kthread_worker - flush all current works on a kthread_worker - * @worker: worker to flush - * - * Wait until all currently executing or pending works on @worker are - * finished. 
- */ -void flush_kthread_worker(struct kthread_worker *worker) -{ - struct kthread_flush_work fwork = { - KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), - COMPLETION_INITIALIZER_ONSTACK(fwork.done), - }; - - queue_kthread_work(worker, &fwork.work); - wait_for_completion(&fwork.done); -} -EXPORT_SYMBOL_GPL(flush_kthread_worker); -/* - * latencytop.c: Latency display infrastructure - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -/* - * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is - * used by the "latencytop" userspace tool. The latency that is tracked is not - * the 'traditional' interrupt latency (which is primarily caused by something - * else consuming CPU), but instead, it is the latency an application encounters - * because the kernel sleeps on its behalf for various reasons. - * - * This code tracks 2 levels of statistics: - * 1) System level latency - * 2) Per process latency - * - * The latency is stored in fixed sized data structures in an accumulated form; - * if the "same" latency cause is hit twice, this will be tracked as one entry - * in the data structure. Both the count, total accumulated latency and maximum - * latency are tracked in this data structure. When the fixed size structure is - * full, no new causes are tracked until the buffer is flushed by writing to - * the /proc file; the userspace tool does this on a regular basis. - * - * A latency cause is identified by a stringified backtrace at the point that - * the scheduler gets invoked. The userland tool will use this string to - * identify the cause of the latency in human readable form. - * - * The information is exported via /proc/latency_stats and /proc//latency. 
- * These files look like this: - * - * Latency Top version : v0.1 - * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl - * | | | | - * | | | +----> the stringified backtrace - * | | +---------> The maximum latency for this entry in microseconds - * | +--------------> The accumulated latency for this entry (microseconds) - * +-------------------> The number of times this entry is hit - * - * (note: the average latency is the accumulated latency divided by the number - * of times) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_RAW_SPINLOCK(latency_lock); - -#define MAXLR 128 -static struct latency_record latency_record[MAXLR]; - -int latencytop_enabled; - -void clear_all_latency_tracing(struct task_struct *p) -{ - unsigned long flags; - - if (!latencytop_enabled) - return; - - raw_spin_lock_irqsave(&latency_lock, flags); - memset(&p->latency_record, 0, sizeof(p->latency_record)); - p->latency_record_count = 0; - raw_spin_unlock_irqrestore(&latency_lock, flags); -} - -static void clear_global_latency_tracing(void) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&latency_lock, flags); - memset(&latency_record, 0, sizeof(latency_record)); - raw_spin_unlock_irqrestore(&latency_lock, flags); -} - -static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) -{ - int firstnonnull = MAXLR + 1; - int i; - - if (!latencytop_enabled) - return; - - /* skip kernel threads for now */ - if (!tsk->mm) - return; - - for (i = 0; i < MAXLR; i++) { - int q, same = 1; - - /* Nothing stored: */ - if (!latency_record[i].backtrace[0]) { - if (firstnonnull > i) - firstnonnull = i; - continue; - } - for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - unsigned long record = lat->backtrace[q]; - - if (latency_record[i].backtrace[q] != record) { - same = 0; - break; - } - - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record 
== ULONG_MAX) - break; - } - if (same) { - latency_record[i].count++; - latency_record[i].time += lat->time; - if (lat->time > latency_record[i].max) - latency_record[i].max = lat->time; - return; - } - } - - i = firstnonnull; - if (i >= MAXLR - 1) - return; - - /* Allocted a new one: */ - memcpy(&latency_record[i], lat, sizeof(struct latency_record)); -} - -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, - struct latency_record *lat) -{ - struct stack_trace trace; - - memset(&trace, 0, sizeof(trace)); - trace.max_entries = LT_BACKTRACEDEPTH; - trace.entries = &lat->backtrace[0]; - save_stack_trace_tsk(tsk, &trace); -} - -/** - * __account_scheduler_latency - record an occurred latency - * @tsk - the task struct of the task hitting the latency - * @usecs - the duration of the latency in microseconds - * @inter - 1 if the sleep was interruptible, 0 if uninterruptible - * - * This function is the main entry point for recording latency entries - * as called by the scheduler. - * - * This function has a few special cases to deal with normal 'non-latency' - * sleeps: specifically, interruptible sleep longer than 5 msec is skipped - * since this usually is caused by waiting for events via select() and co. - * - * Negative latencies (caused by time going backwards) are also explicitly - * skipped. - */ -void __sched -__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) -{ - unsigned long flags; - int i, q; - struct latency_record lat; - - /* Long interruptible waits are generally user requested... 
*/ - if (inter && usecs > 5000) - return; - - /* Negative sleeps are time going backwards */ - /* Zero-time sleeps are non-interesting */ - if (usecs <= 0) - return; - - memset(&lat, 0, sizeof(lat)); - lat.count = 1; - lat.time = usecs; - lat.max = usecs; - store_stacktrace(tsk, &lat); - - raw_spin_lock_irqsave(&latency_lock, flags); - - account_global_scheduler_latency(tsk, &lat); - - for (i = 0; i < tsk->latency_record_count; i++) { - struct latency_record *mylat; - int same = 1; - - mylat = &tsk->latency_record[i]; - for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - unsigned long record = lat.backtrace[q]; - - if (mylat->backtrace[q] != record) { - same = 0; - break; - } - - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) - break; - } - if (same) { - mylat->count++; - mylat->time += lat.time; - if (lat.time > mylat->max) - mylat->max = lat.time; - goto out_unlock; - } - } - - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - /* Allocated a new one: */ - i = tsk->latency_record_count++; - memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); - -out_unlock: - raw_spin_unlock_irqrestore(&latency_lock, flags); -} - -static int lstats_show(struct seq_file *m, void *v) -{ - int i; - - seq_puts(m, "Latency Top version : v0.1\n"); - - for (i = 0; i < MAXLR; i++) { - struct latency_record *lr = &latency_record[i]; - - if (lr->backtrace[0]) { - int q; - seq_printf(m, "%i %lu %lu", - lr->count, lr->time, lr->max); - for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - unsigned long bt = lr->backtrace[q]; - if (!bt) - break; - if (bt == ULONG_MAX) - break; - seq_printf(m, " %ps", (void *)bt); - } - seq_printf(m, "\n"); - } - } - return 0; -} - -static ssize_t -lstats_write(struct file *file, const char __user *buf, size_t count, - loff_t *offs) -{ - clear_global_latency_tracing(); - - return count; -} - -static int lstats_open(struct 
inode *inode, struct file *filp) -{ - return single_open(filp, lstats_show, NULL); -} - -static const struct file_operations lstats_fops = { - .open = lstats_open, - .read = seq_read, - .write = lstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init init_lstats_procfs(void) -{ - proc_create("latency_stats", 0644, NULL, &lstats_fops); - return 0; -} -device_initcall(init_lstats_procfs); -/* - * kernel/lockdep.c - * - * Runtime locking correctness validator - * - * Started by Ingo Molnar: - * - * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * this code maps all the lock dependencies as they occur in a live kernel - * and will warn about the following classes of locking bugs: - * - * - lock inversion scenarios - * - circular lock dependencies - * - hardirq/softirq safe/unsafe locking bugs - * - * Bugs are reported even if the current locking scenario does not cause - * any deadlock at this point. - * - * I.e. if anytime in the past two locks were taken in a different order, - * even if it happened for another task, even if those were different - * locks (but of the same class as this lock), this code will detect it. - * - * Thanks to Arjan van de Ven for coming up with the initial idea of - * mapping lock dependencies runtime. 
- */ -#define DISABLE_BRANCH_PROFILING -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "lockdep_internals.h" - -#define CREATE_TRACE_POINTS -#include - -#ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; -module_param(prove_locking, int, 0644); -#else -#define prove_locking 0 -#endif - -#ifdef CONFIG_LOCK_STAT -int lock_stat = 1; -module_param(lock_stat, int, 0644); -#else -#define lock_stat 0 -#endif - -/* - * lockdep_lock: protects the lockdep graph, the hashes and the - * class/list/hash allocators. - * - * This is one of the rare exceptions where it's justified - * to use a raw spinlock - we really dont want the spinlock - * code to recurse back into the lockdep code... - */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - -static int graph_lock(void) -{ - arch_spin_lock(&lockdep_lock); - /* - * Make sure that if another CPU detected a bug while - * walking the graph we dont change it (while the other - * CPU is busy printing out stuff with the graph lock - * dropped already) - */ - if (!debug_locks) { - arch_spin_unlock(&lockdep_lock); - return 0; - } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; - return 1; -} - -static inline int graph_unlock(void) -{ - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! 
- */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; -} - -/* - * Turn lock debugging off and return with 0 if it was off already, - * and also release the graph lock: - */ -static inline int debug_locks_off_graph_unlock(void) -{ - int ret = debug_locks_off(); - - arch_spin_unlock(&lockdep_lock); - - return ret; -} - -static int lockdep_initialized; - -unsigned long nr_list_entries; -static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; - -/* - * All data structures here are protected by the global debug_lock. - * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. - */ -unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; - -static inline struct lock_class *hlock_class(struct held_lock *hlock) -{ - if (!hlock->class_idx) { - /* - * Someone passed in garbage, we give up. - */ - DEBUG_LOCKS_WARN_ON(1); - return NULL; - } - return lock_classes + hlock->class_idx - 1; -} - -#ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); - -static inline u64 lockstat_clock(void) -{ - return local_clock(); -} - -static int lock_point(unsigned long points[], unsigned long ip) -{ - int i; - - for (i = 0; i < LOCKSTAT_POINTS; i++) { - if (points[i] == 0) { - points[i] = ip; - break; - } - if (points[i] == ip) - break; - } - - return i; -} - -static void lock_time_inc(struct lock_time *lt, u64 time) -{ - if (time > lt->max) - lt->max = time; - - if (time < lt->min || !lt->nr) - lt->min = time; - - lt->total += time; - lt->nr++; -} - -static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) -{ - if (!src->nr) - return; - - if (src->max > dst->max) - dst->max = src->max; - - if (src->min < dst->min || !dst->nr) - dst->min = src->min; - - dst->total += src->total; - dst->nr += src->nr; -} - -struct lock_class_stats 
lock_stats(struct lock_class *class) -{ - struct lock_class_stats stats; - int cpu, i; - - memset(&stats, 0, sizeof(struct lock_class_stats)); - for_each_possible_cpu(cpu) { - struct lock_class_stats *pcs = - &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; - - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; - - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); - - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); - - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; - } - - return stats; -} - -void clear_lock_stats(struct lock_class *class) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct lock_class_stats *cpu_stats = - &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - - memset(cpu_stats, 0, sizeof(struct lock_class_stats)); - } - memset(class->contention_point, 0, sizeof(class->contention_point)); - memset(class->contending_point, 0, sizeof(class->contending_point)); -} - -static struct lock_class_stats *get_lock_stats(struct lock_class *class) -{ - return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(cpu_lock_stats); -} - -static void lock_release_holdtime(struct held_lock *hlock) -{ - struct lock_class_stats *stats; - u64 holdtime; - - if (!lock_stat) - return; - - holdtime = lockstat_clock() - hlock->holdtime_stamp; - - stats = get_lock_stats(hlock_class(hlock)); - if (hlock->read) - lock_time_inc(&stats->read_holdtime, holdtime); - else - lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); -} -#else -static inline void lock_release_holdtime(struct held_lock *hlock) -{ -} -#endif - 
-/* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. - */ -LIST_HEAD(all_lock_classes); - -/* - * The lockdep classes are in a hash-table as well, for fast lookup: - */ -#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) -#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) -#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) -#define classhashentry(key) (classhash_table + __classhashfn((key))) - -static struct list_head classhash_table[CLASSHASH_SIZE]; - -/* - * We put the lock dependency chains into a hash-table as well, to cache - * their existence: - */ -#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) -#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) -#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) -#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) - -static struct list_head chainhash_table[CHAINHASH_SIZE]; - -/* - * The hash key of the lock dependency chains is a hash itself too: - * it's a hash of all locks taken up to that lock, including that lock. - * It's a 64-bit hash, because it's important for the keys to be - * unique. 
- */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) - -void lockdep_off(void) -{ - current->lockdep_recursion++; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion--; -} -EXPORT_SYMBOL(lockdep_on); - -/* - * Debugging switches: - */ - -#define VERBOSE 0 -#define VERY_VERBOSE 0 - -#if VERBOSE -# define HARDIRQ_VERBOSE 1 -# define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 -#else -# define HARDIRQ_VERBOSE 0 -# define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 -#endif - -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE -/* - * Quick filtering for interesting events: - */ -static int class_filter(struct lock_class *class) -{ -#if 0 - /* Example */ - if (class->name_version == 1 && - !strcmp(class->name, "lockname")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "&struct->lockfield")) - return 1; -#endif - /* Filter everything else. 1 would be to allow everything else */ - return 0; -} -#endif - -static int verbose(struct lock_class *class) -{ -#if VERBOSE - return class_filter(class); -#endif - return 0; -} - -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - -static int save_trace(struct stack_trace *trace) -{ - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - /* - * Some daft arches put -1 at the end to indicate its a full trace. 
- * - * this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; - - trace->max_entries = trace->nr_entries; - - nr_stack_trace_entries += trace->nr_entries; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - - return 0; - } - - return 1; -} - -unsigned int nr_hardirq_chains; -unsigned int nr_softirq_chains; -unsigned int nr_process_chains; -unsigned int max_lockdep_depth; - -#ifdef CONFIG_DEBUG_LOCKDEP -/* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). 
- */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* - * Various lockdep statistics: - */ -DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); -#endif - -/* - * Locking printouts: - */ - -#define __USAGE(__STATE) \ - [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \ - [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \ - [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ - [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", - -static const char *usage_str[] = -{ -#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) -#include "lockdep_states.h" -#undef LOCKDEP_STATE - [LOCK_USED] = "INITIAL USE", -}; - -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) -{ - return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); -} - -static inline unsigned long lock_flag(enum lock_usage_bit bit) -{ - return 1UL << bit; -} - -static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) -{ - char c = '.'; - - if (class->usage_mask & lock_flag(bit + 2)) - c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) - c = '?'; - } - - return c; -} - -void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) -{ - int i = 0; - -#define LOCKDEP_STATE(__STATE) \ - usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ - usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); -#include "lockdep_states.h" -#undef LOCKDEP_STATE - - usage[i] = '\0'; -} - -static void __print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN]; - const char *name; - - name = class->name; - if (!name) { - name = __get_key_name(class->key, str); - printk("%s", name); - } else { - printk("%s", name); - if 
(class->name_version > 1) - printk("#%d", class->name_version); - if (class->subclass) - printk("/%d", class->subclass); - } -} - -static void print_lock_name(struct lock_class *class) -{ - char usage[LOCK_USAGE_CHARS]; - - get_usage_chars(class, usage); - - printk(" ("); - __print_lock_name(class); - printk("){%s}", usage); -} - -static void print_lockdep_cache(struct lockdep_map *lock) -{ - const char *name; - char str[KSYM_NAME_LEN]; - - name = lock->name; - if (!name) - name = __get_key_name(lock->key->subkeys, str); - - printk("%s", name); -} - -static void print_lock(struct held_lock *hlock) -{ - print_lock_name(hlock_class(hlock)); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); -} - -static void lockdep_print_held_locks(struct task_struct *curr) -{ - int i, depth = curr->lockdep_depth; - - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); - return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); - - for (i = 0; i < depth; i++) { - printk(" #%d: ", i); - print_lock(curr->held_locks + i); - } -} - -static void print_kernel_ident(void) -{ - printk("%s %.*s %s\n", init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, - print_tainted()); -} - -static int very_verbose(struct lock_class *class) -{ -#if VERY_VERBOSE - return class_filter(class); -#endif - return 0; -} - -/* - * Is this the address of a static object: - */ -static int static_obj(void *obj) -{ - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; - - /* - * static variable? - */ - if ((addr >= start) && (addr < end)) - return 1; - - if (arch_is_kernel_data(addr)) - return 1; - - /* - * in-kernel percpu var? - */ - if (is_kernel_percpu_address(addr)) - return 1; - - /* - * module static or percpu var? 
- */ - return is_module_address(addr) || is_module_percpu_address(addr); -} - -/* - * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: - */ -static int count_matching_names(struct lock_class *new_class) -{ - struct lock_class *class; - int count = 0; - - if (!new_class->name) - return 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (new_class->key - new_class->subclass == class->key) - return class->name_version; - if (class->name && !strcmp(class->name, new_class->name)) - count = max(count, class->name_version); - } - - return count + 1; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk(KERN_ERR - "BUG: looking up invalid subclass: %u\n", subclass); - printk(KERN_ERR - "turning off the locking correctness validator.\n"); - dump_stack(); - return NULL; - } - - /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: - */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; - - /* - * NOTE: the class-key must be unique. 
For dynamic locks, a static - * lock_class_key variable is passed in through the mutex_init() - * (or spin_lock_init()) call - which acts as the key. For static - * locks we use the lock object itself as the key. - */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > - sizeof(struct lockdep_map)); - - key = lock->key->subkeys + subclass; - - hash_head = classhashentry(key); - - /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: - */ - list_for_each_entry(class, hash_head, hash_entry) { - if (class->key == key) { - /* - * Huh! same key, different name? Did someone trample - * on some memory? We're most confused. - */ - WARN_ON_ONCE(class->name != lock->name); - return class; - } - } - - return NULL; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - unsigned long flags; - - class = look_up_lock_class(lock, subclass); - if (likely(class)) - goto out_set_class_cache; - - /* - * Debug-check: all keys must be persistent! 
- */ - if (!static_obj(lock->key)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - - return NULL; - } - - key = lock->key->subkeys + subclass; - hash_head = classhashentry(key); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - /* - * We have to do the hash-walk again, to avoid races - * with another CPU: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - goto out_unlock_set; - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); - return NULL; - } - raw_local_irq_restore(flags); - - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return NULL; - } - class = lock_classes + nr_lock_classes++; - debug_atomic_inc(nr_unused_locks); - class->key = key; - class->name = lock->name; - class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); - class->name_version = count_matching_names(class); - /* - * We use RCU's safe list-add method to make - * parallel walking of the hash-list safe: - */ - list_add_tail_rcu(&class->hash_entry, hash_head); - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&class->lock_entry, &all_lock_classes); - - if (verbose(class)) { - graph_unlock(); - raw_local_irq_restore(flags); - - printk("\nnew class %p: %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - } -out_unlock_set: - 
graph_unlock(); - raw_local_irq_restore(flags); - -out_set_class_cache: - if (!subclass || force) - lock->class_cache[0] = class; - else if (subclass < NR_LOCKDEP_CACHING_CLASSES) - lock->class_cache[subclass] = class; - - /* - * Hash collision, did we smoke some? We found a class with a matching - * hash but the subclass -- which is hashed in -- didn't match. - */ - if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) - return NULL; - - return class; -} - -#ifdef CONFIG_PROVE_LOCKING -/* - * Allocate a lockdep entry. (assumes the graph_lock held, returns - * with NULL on failure) - */ -static struct lock_list *alloc_list_entry(void) -{ - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return NULL; - - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return NULL; - } - return list_entries + nr_list_entries++; -} - -/* - * Add a new dependency to the head of the list: - */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) -{ - struct lock_list *entry; - /* - * Lock not present yet - get a new dependency struct and - * add it to the list: - */ - entry = alloc_list_entry(); - if (!entry) - return 0; - - entry->class = this; - entry->distance = distance; - entry->trace = *trace; - /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. 
But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: - */ - list_add_tail_rcu(&entry->entry, head); - - return 1; -} - -/* - * For good efficiency of modular, we use power of 2 - */ -#define MAX_CIRCULAR_QUEUE_SIZE 4096UL -#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) - -/* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. - */ -struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; - unsigned int front, rear; -}; - -static struct circular_queue lock_cq; - -unsigned int max_bfs_queue_depth; - -static unsigned int lockdep_dependency_gen_id; - -static inline void __cq_init(struct circular_queue *cq) -{ - cq->front = cq->rear = 0; - lockdep_dependency_gen_id++; -} - -static inline int __cq_empty(struct circular_queue *cq) -{ - return (cq->front == cq->rear); -} - -static inline int __cq_full(struct circular_queue *cq) -{ - return ((cq->rear + 1) & CQ_MASK) == cq->front; -} - -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) -{ - if (__cq_full(cq)) - return -1; - - cq->element[cq->rear] = elem; - cq->rear = (cq->rear + 1) & CQ_MASK; - return 0; -} - -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) -{ - if (__cq_empty(cq)) - return -1; - - *elem = cq->element[cq->front]; - cq->front = (cq->front + 1) & CQ_MASK; - return 0; -} - -static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) -{ - return (cq->rear - cq->front) & CQ_MASK; -} - -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) -{ - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ - lock->parent = parent; - lock->class->dep_gen_id = 
lockdep_dependency_gen_id; -} - -static inline unsigned long lock_accessed(struct lock_list *lock) -{ - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ - return lock->class->dep_gen_id == lockdep_dependency_gen_id; -} - -static inline struct lock_list *get_lock_parent(struct lock_list *child) -{ - return child->parent; -} - -static inline int get_lock_depth(struct lock_list *child) -{ - int depth = 0; - struct lock_list *parent; - - while ((parent = get_lock_parent(child))) { - child = parent; - depth++; - } - return depth; -} - -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int forward) -{ - struct lock_list *entry; - struct list_head *head; - struct circular_queue *cq = &lock_cq; - int ret = 1; - - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = 0; - goto exit; - } - - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; - - if (list_empty(head)) - goto exit; - - __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); - - while (!__cq_empty(cq)) { - struct lock_list *lock; - - __cq_dequeue(cq, (unsigned long *)&lock); - - if (!lock->class) { - ret = -2; - goto exit; - } - - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; - - list_for_each_entry(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = 0; - goto exit; - } - - if (__cq_enqueue(cq, (unsigned long)entry)) { - ret = -1; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; - } - } - } -exit: - return ret; -} - -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct 
lock_list *entry, void *data), - struct lock_list **target_entry) -{ - return __bfs(src_entry, data, match, target_entry, 1); - -} - -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) -{ - return __bfs(src_entry, data, match, target_entry, 0); - -} - -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - */ - -/* - * Print a dependency chain entry (this is only done when a deadlock - * has been detected): - */ -static noinline int -print_circular_bug_entry(struct lock_list *target, int depth) -{ - if (debug_locks_silent) - return 0; - printk("\n-> #%u", depth); - print_lock_name(target->class); - printk(":\n"); - print_stack_trace(&target->trace, 6); - - return 0; -} - -static void -print_circular_lock_scenario(struct held_lock *src, - struct held_lock *tgt, - struct lock_list *prt) -{ - struct lock_class *source = hlock_class(src); - struct lock_class *target = hlock_class(tgt); - struct lock_class *parent = prt->class; - - /* - * A direct locking problem where unsafe_class lock is taken - * directly by safe_class lock, then all we need to show - * is the deadlock scenario, as it is obvious that the - * unsafe lock is taken under the safe lock. - * - * But if there is a chain instead, where the safe lock takes - * an intermediate lock (middle_class) where this lock is - * not the same as the safe lock, then the lock chain is - * used to describe the problem. Otherwise we would need - * to show a different CPU case for each link in the chain - * from the safe_class lock to the unsafe_class lock. 
- */ - if (parent != source) { - printk("Chain exists of:\n "); - __print_lock_name(source); - printk(" --> "); - __print_lock_name(parent); - printk(" --> "); - __print_lock_name(target); - printk("\n\n"); - } - - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(");\n"); - printk(" lock("); - __print_lock_name(target); - printk(");\n"); - printk(" lock("); - __print_lock_name(source); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); -} - -/* - * When a circular dependency is detected, print the - * header first: - */ -static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth, - struct held_lock *check_src, - struct held_lock *check_tgt) -{ - struct task_struct *curr = current; - - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_ident(); - printk("-------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(check_src); - printk("\nbut task is already holding lock:\n"); - print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); - - print_circular_bug_entry(entry, depth); - - return 0; -} - -static inline int class_equal(struct lock_list *entry, void *data) -{ - return entry->class == data; -} - -static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) -{ - struct task_struct *curr = current; - struct lock_list *parent; - struct lock_list *first_parent; - int depth; - - if (!debug_locks_off_graph_unlock() || 
debug_locks_silent) - return 0; - - if (!save_trace(&this->trace)) - return 0; - - depth = get_lock_depth(target); - - print_circular_bug_header(target, depth, check_src, check_tgt); - - parent = get_lock_parent(target); - first_parent = parent; - - while (parent) { - print_circular_bug_entry(parent, --depth); - parent = get_lock_parent(parent); - } - - printk("\nother info that might help us debug this:\n\n"); - print_circular_lock_scenario(check_src, check_tgt, - first_parent); - - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static noinline int print_bfs_bug(int ret) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - /* - * Breadth-first-search failed, graph got corrupted? - */ - WARN(1, "lockdep bfs error:%d\n", ret); - - return 0; -} - -static int noop_count(struct lock_list *entry, void *data) -{ - (*(unsigned long *)data)++; - return 0; -} - -unsigned long __lockdep_count_forward_deps(struct lock_list *this) -{ - unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); - - __bfs_forwards(this, (void *)&count, noop_count, &target_entry); - - return count; -} -unsigned long lockdep_count_forward_deps(struct lock_class *class) -{ - unsigned long ret, flags; - struct lock_list this; - - this.parent = NULL; - this.class = class; - - local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); - - return ret; -} - -unsigned long __lockdep_count_backward_deps(struct lock_list *this) -{ - unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); - - __bfs_backwards(this, (void *)&count, noop_count, &target_entry); - - return count; -} - -unsigned long lockdep_count_backward_deps(struct lock_class *class) -{ - unsigned long ret, flags; - struct lock_list this; - - this.parent = NULL; - this.class = class; - - local_irq_save(flags); - 
arch_spin_lock(&lockdep_lock); - ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); - - return ret; -} - -/* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. - */ -static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) -{ - int result; - - debug_atomic_inc(nr_cyclic_checks); - - result = __bfs_forwards(root, target, class_equal, target_entry); - - return result; -} - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) -/* - * Forwards and backwards subgraph searching, for the purposes of - * proving that two subgraphs can be connected by a new dependency - * without creating any illegal irq-safe -> irq-unsafe lock dependency. - */ - -static inline int usage_match(struct lock_list *entry, void *bit) -{ - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); -} - - - -/* - * Find a node in the forwards-direction dependency sub-graph starting - * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. - */ -static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, - struct lock_list **target_entry) -{ - int result; - - debug_atomic_inc(nr_find_usage_forwards_checks); - - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); - - return result; -} - -/* - * Find a node in the backwards-direction dependency sub-graph starting - * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. 
- */ -static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, - struct lock_list **target_entry) -{ - int result; - - debug_atomic_inc(nr_find_usage_backwards_checks); - - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); - - return result; -} - -static void print_lock_class_header(struct lock_class *class, int depth) -{ - int bit; - - printk("%*s->", depth, ""); - print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); - - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { - if (class->usage_mask & (1 << bit)) { - int len = depth; - - len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); - print_stack_trace(class->usage_traces + bit, len); - } - } - printk("%*s }\n", depth, ""); - - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); -} - -/* - * printk the shortest lock dependencies from @start to @end in reverse order: - */ -static void __used -print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) -{ - struct lock_list *entry = leaf; - int depth; - - /*compute depth from generated tree by BFS*/ - depth = get_lock_depth(leaf); - - do { - print_lock_class_header(entry->class, depth); - printk("%*s ... 
acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - - if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); - break; - } - - entry = get_lock_parent(entry); - depth--; - } while (entry && (depth >= 0)); - - return; -} - -static void -print_irq_lock_scenario(struct lock_list *safe_entry, - struct lock_list *unsafe_entry, - struct lock_class *prev_class, - struct lock_class *next_class) -{ - struct lock_class *safe_class = safe_entry->class; - struct lock_class *unsafe_class = unsafe_entry->class; - struct lock_class *middle_class = prev_class; - - if (middle_class == safe_class) - middle_class = next_class; - - /* - * A direct locking problem where unsafe_class lock is taken - * directly by safe_class lock, then all we need to show - * is the deadlock scenario, as it is obvious that the - * unsafe lock is taken under the safe lock. - * - * But if there is a chain instead, where the safe lock takes - * an intermediate lock (middle_class) where this lock is - * not the same as the safe lock, then the lock chain is - * used to describe the problem. Otherwise we would need - * to show a different CPU case for each link in the chain - * from the safe_class lock to the unsafe_class lock. 
- */ - if (middle_class != unsafe_class) { - printk("Chain exists of:\n "); - __print_lock_name(safe_class); - printk(" --> "); - __print_lock_name(middle_class); - printk(" --> "); - __print_lock_name(unsafe_class); - printk("\n\n"); - } - - printk(" Possible interrupt unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(unsafe_class); - printk(");\n"); - printk(" local_irq_disable();\n"); - printk(" lock("); - __print_lock_name(safe_class); - printk(");\n"); - printk(" lock("); - __print_lock_name(middle_class); - printk(");\n"); - printk(" \n"); - printk(" lock("); - __print_lock_name(safe_class); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); -} - -static int -print_bad_irq_dependency(struct task_struct *curr, - struct lock_list *prev_root, - struct lock_list *next_root, - struct lock_list *backwards_entry, - struct lock_list *forwards_entry, - struct held_lock *prev, - struct held_lock *next, - enum lock_usage_bit bit1, - enum lock_usage_bit bit2, - const char *irqclass) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", - irqclass, irqclass); - print_kernel_ident(); - printk("------------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", - curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, - curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, - curr->softirqs_enabled); - print_lock(next); - - printk("\nand this task is already holding:\n"); - print_lock(prev); - printk("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); - printk(" ->"); - print_lock_name(hlock_class(next)); - printk("\n"); - - printk("\nbut this new dependency connects 
a %s-irq-safe lock:\n", - irqclass); - print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); - - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - - printk("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); - - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - - printk("\nother info that might help us debug this:\n\n"); - print_irq_lock_scenario(backwards_entry, forwards_entry, - hlock_class(prev), hlock_class(next)); - - lockdep_print_held_locks(curr); - - printk("\nthe dependencies between %s-irq-safe lock", irqclass); - printk(" and the holding lock:\n"); - if (!save_trace(&prev_root->trace)) - return 0; - print_shortest_lock_dependencies(backwards_entry, prev_root); - - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) - return 0; - print_shortest_lock_dependencies(forwards_entry, next_root); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, 
target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); -} - -static const char *state_names[] = { -#define LOCKDEP_STATE(__STATE) \ - __stringify(__STATE), -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -static const char *state_rnames[] = { -#define LOCKDEP_STATE(__STATE) \ - __stringify(__STATE)"-READ", -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -static inline const char *state_name(enum lock_usage_bit bit) -{ - return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; -} - -static int exclusive_bit(int new_bit) -{ - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; - - /* - * keep state, bit flip the direction and strip read. - */ - return state | (dir ^ 2); -} - -static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) -{ - /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; - - bit++; /* _READ */ - - /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; - - return 1; -} - -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE - - return 1; -} - -static void 
inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - -#else - -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - return 1; -} - -static inline void inc_chains(void) -{ - nr_process_chains++; -} - -#endif - -static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) -{ - struct lock_class *next = hlock_class(nxt); - struct lock_class *prev = hlock_class(prv); - - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0\n"); - printk(" ----\n"); - printk(" lock("); - __print_lock_name(prev); - printk(");\n"); - printk(" lock("); - __print_lock_name(next); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); - printk(" May be due to missing lock nesting notation\n\n"); -} - -static int -print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); - print_kernel_ident(); - printk("---------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(next); - printk("\nbut task is already holding lock:\n"); - print_lock(prev); - - printk("\nother info that might help us debug this:\n"); - print_deadlock_scenario(next, prev); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Check whether we are holding such a class already. - * - * (Note that this has to be done separately, because the graph cannot - * detect such classes of deadlocks.) 
- * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read - */ -static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) -{ - struct held_lock *prev; - struct held_lock *nest = NULL; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - prev = curr->held_locks + i; - - if (prev->instance == next->nest_lock) - nest = prev; - - if (hlock_class(prev) != hlock_class(next)) - continue; - - /* - * Allow read-after-read recursion of the same - * lock class (i.e. read_lock(lock)+read_lock(lock)): - */ - if ((read == 2) && prev->read) - return 2; - - /* - * We're holding the nest_lock, which serializes this lock's - * nesting behaviour. - */ - if (nest) - return 2; - - return print_deadlock_bug(curr, prev, next); - } - return 1; -} - -/* - * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: - * - * - would the adding of the -> dependency create a - * circular dependency in the graph? [== circular deadlock] - * - * - does the new prev->next dependency connect any hardirq-safe lock - * (in the full backwards-subgraph starting at ) with any - * hardirq-unsafe lock (in the full forwards-subgraph starting at - * )? [== illegal lock inversion with hardirq contexts] - * - * - does the new prev->next dependency connect any softirq-safe lock - * (in the full backwards-subgraph starting at ) with any - * softirq-unsafe lock (in the full forwards-subgraph starting at - * )? [== illegal lock inversion with softirq contexts] - * - * any of these scenarios could lead to a deadlock. - * - * Then if all the validations pass, we add the forwards and backwards - * dependency. 
- */ -static int -check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) -{ - struct lock_list *entry; - int ret; - struct lock_list this; - struct lock_list *uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. - */ - static struct stack_trace trace; - - /* - * Prove that the new -> dependency would not - * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at , and - * checking whether we can reach .) - * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: - */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); - - if (!check_prev_add_irq(curr, prev, next)) - return 0; - - /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; - /* - * Is the -> dependency already present? - * - * (this may occur even though this is a new chain: consider - * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 - * chains - the second one will be new, but L1 already has - * L2 added to its dependency list, due to the first chain.) 
- */ - list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { - if (entry->class == hlock_class(next)) { - if (distance == 1) - entry->distance = 1; - return 2; - } - } - - if (!trylock_loop && !save_trace(&trace)) - return 0; - - /* - * Ok, all validations passed, add the new lock - * to the previous lock's dependency list: - */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); - - if (!ret) - return 0; - - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); - if (!ret) - return 0; - - /* - * Debugging printouts: - */ - if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(hlock_class(prev)); - printk(" => "); - print_lock_name(hlock_class(next)); - printk("\n"); - dump_stack(); - return graph_lock(); - } - return 1; -} - -/* - * Add the dependency to all directly-previous locks that are 'relevant'. - * The ones that are relevant are (in increasing distance from curr): - * all consecutive trylock entries and the final non-trylock entry - or - * the end of this context's lock-chain - whichever comes first. - */ -static int -check_prevs_add(struct task_struct *curr, struct held_lock *next) -{ - int depth = curr->lockdep_depth; - int trylock_loop = 0; - struct held_lock *hlock; - - /* - * Debugging checks. 
- * - * Depth must not be zero for a non-head lock: - */ - if (!depth) - goto out_bug; - /* - * At least two relevant locks must exist for this - * to be a head: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - goto out_bug; - - for (;;) { - int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) - return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } - depth--; - /* - * End of lock-stack? - */ - if (!depth) - break; - /* - * Stop the search if we cross into another context: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - break; - trylock_loop = 1; - } - return 1; -out_bug: - if (!debug_locks_off_graph_unlock()) - return 0; - - /* - * Clearly we all shouldn't be here, but since we made it we - * can reliable say we messed up our state. See the above two - * gotos for reasons why we could possibly end up here. - */ - WARN_ON(1); - - return 0; -} - -unsigned long nr_lock_chains; -struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -int nr_chain_hlocks; -static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; - -struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) -{ - return lock_classes + chain_hlocks[chain->base + i]; -} - -/* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) 
- */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) -{ - struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); - struct lock_chain *chain; - struct held_lock *hlock_curr, *hlock_next; - int i, j; - - /* - * We might need to take the graph lock, ensure we've got IRQs - * disabled to make this an IRQ-safe lock.. for recursion reasons - * lockdep won't complain about its own locking errors. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ - list_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); - return 0; - } - } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); - /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - if (!graph_lock()) - return 0; - /* - * We have to walk the chain again locked - to avoid duplicates: - */ - list_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { - if (!debug_locks_off_graph_unlock()) - return 0; - - printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - chain = lock_chains + nr_lock_chains++; - chain->chain_key = chain_key; - chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - hlock_next = hlock; - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if 
(hlock_curr->irq_context != hlock_next->irq_context) - break; - hlock_next = hlock; - } - i++; - chain->depth = curr->lockdep_depth + 1 - i; - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - } - list_add_tail_rcu(&chain->entry, hash_head); - debug_atomic_inc(chain_lookup_misses); - inc_chains(); - - return 1; -} - -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) -{ - /* - * Trylock needs to maintain the stack of held locks, but it - * does not add new dependencies, because trylock can be done - * in any order. - * - * We look up the chain_key and do the O(N^2) check and update of - * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires - * graph_lock for us) - */ - if (!hlock->trylock && (hlock->check == 2) && - lookup_chain_cache(curr, hlock, chain_key)) { - /* - * Check whether last held lock: - * - * - is irq-safe, if this lock is irq-unsafe - * - is softirq-safe, if this lock is hardirq-unsafe - * - * And check whether the new lock's dependency graph - * could lead back to the previous lock. - * - * any of these scenarios could lead to a deadlock. 
If - * All validations - */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); - - if (!ret) - return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* - * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: - */ - if (!chain_head && ret != 2) - if (!check_prevs_add(curr, hlock)) - return 0; - graph_unlock(); - } else - /* after lookup_chain_cache(): */ - if (unlikely(!debug_locks)) - return 0; - - return 1; -} -#else -static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) -{ - return 1; -} -#endif - -/* - * We are building curr_chain_key incrementally, so double-check - * it from scratch, to make sure that it's done correctly: - */ -static void check_chain_key(struct task_struct *curr) -{ -#ifdef CONFIG_DEBUG_LOCKDEP - struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; - u64 chain_key = 0; - - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - if (chain_key != hlock->prev_chain_key) { - debug_locks_off(); - /* - * We got mighty confused, our chain keys don't match - * with what we expect, someone trample on our task state? - */ - WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", - curr->lockdep_depth, i, - (unsigned long long)chain_key, - (unsigned long long)hlock->prev_chain_key); - return; - } - id = hlock->class_idx - 1; - /* - * Whoops ran out of static storage again? - */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - return; - - if (prev_hlock && (prev_hlock->irq_context != - hlock->irq_context)) - chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); - prev_hlock = hlock; - } - if (chain_key != curr->curr_chain_key) { - debug_locks_off(); - /* - * More smoking hash instead of calculating it, damn see these - * numbers float.. 
I bet that a pink elephant stepped on my memory. - */ - WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", - curr->lockdep_depth, i, - (unsigned long long)chain_key, - (unsigned long long)curr->curr_chain_key); - } -#endif -} - -static void -print_usage_bug_scenario(struct held_lock *lock) -{ - struct lock_class *class = hlock_class(lock); - - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0\n"); - printk(" ----\n"); - printk(" lock("); - __print_lock_name(class); - printk(");\n"); - printk(" \n"); - printk(" lock("); - __print_lock_name(class); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); -} - -static int -print_usage_bug(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); - print_kernel_ident(); - printk("---------------------------------\n"); - - printk("inconsistent {%s} -> {%s} usage.\n", - usage_str[prev_bit], usage_str[new_bit]); - - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, task_pid_nr(curr), - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); - print_lock(this); - - printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); - - print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); - print_usage_bug_scenario(this); - - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Print out an error if an invalid bit is set: - */ -static inline int -valid_state(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, enum lock_usage_bit 
bad_bit) -{ - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); - return 1; -} - -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - -/* - * print irq inversion bug: - */ -static int -print_irq_inversion_bug(struct task_struct *curr, - struct lock_list *root, struct lock_list *other, - struct held_lock *this, int forwards, - const char *irqclass) -{ - struct lock_list *entry = other; - struct lock_list *middle = NULL; - int depth; - - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); - print_kernel_ident(); - printk("---------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(this); - if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); - else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - - printk("\nother info that might help us debug this:\n"); - - /* Find a middle lock (if one exists) */ - depth = get_lock_depth(other); - do { - if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); - break; - } - middle = entry; - entry = get_lock_parent(entry); - depth--; - } while (entry && entry != root && (depth >= 0)); - if (forwards) - print_irq_lock_scenario(root, other, - middle ? middle->class : root->class, other->class); - else - print_irq_lock_scenario(other, root, - middle ? 
middle->class : other->class, root->class); - - lockdep_print_held_locks(curr); - - printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) - return 0; - print_shortest_lock_dependencies(other, root); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Prove that in the forwards-direction subgraph starting at - * there is no lock matching : - */ -static int -check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) -{ - int ret; - struct lock_list root; - struct lock_list *uninitialized_var(target_entry); - - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); -} - -/* - * Prove that in the backwards-direction subgraph starting at - * there is no lock matching : - */ -static int -check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) -{ - int ret; - struct lock_list root; - struct lock_list *uninitialized_var(target_entry); - - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); -} - -void print_irqtrace_events(struct task_struct *curr) -{ - printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - 
printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); -} - -static int HARDIRQ_verbose(struct lock_class *class) -{ -#if HARDIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int SOFTIRQ_verbose(struct lock_class *class) -{ -#if SOFTIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#define STRICT_READ_CHECKS 1 - -static int (*state_verbose_f[])(struct lock_class *class) = { -#define LOCKDEP_STATE(__STATE) \ - __STATE##_verbose, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -static inline int state_verbose(enum lock_usage_bit bit, - struct lock_class *class) -{ - return state_verbose_f[bit >> 2](class); -} - -typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, - enum lock_usage_bit bit, const char *name); - -static int -mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; - - /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - - /* - * Validate that this particular lock does not have conflicting - * usage states. - */ - if (!valid_state(curr, this, new_bit, excl_bit)) - return 0; - - /* - * Validate that the lock dependencies don't have conflicting usage - * states. 
- */ - if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) - return 0; - - /* - * Check for read in write conflicts - */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) - return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) - return 0; - } - - if (state_verbose(new_bit, hlock_class(this))) - return 2; - - return 1; -} - -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -/* - * Mark all held locks with a usage bit: - */ -static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) -{ - enum lock_usage_bit usage_bit; - struct held_lock *hlock; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - - usage_bit = 2 + (mark << 2); /* ENABLED */ - if (hlock->read) - usage_bit += 1; /* READ */ - - BUG_ON(usage_bit >= LOCK_USAGE_STATES); - - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) - continue; - - if (!mark_lock(curr, hlock, usage_bit)) - return 0; - } - - return 1; -} - -/* - * Hardirqs will be enabled: - */ -static void __trace_hardirqs_on_caller(unsigned long ip) -{ - struct task_struct *curr = current; - - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - - /* - * We are going to turn hardirqs on, so set the - * usage bit for all held locks: - */ - if (!mark_held_locks(curr, HARDIRQ)) - return; - /* - * If we have softirqs enabled, then set the usage - * bit for all held locks. 
(disabled hardirqs prevented - * this bit from being set before) - */ - if (curr->softirqs_enabled) - if (!mark_held_locks(curr, SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); -} - -void trace_hardirqs_on_caller(unsigned long ip) -{ - time_hardirqs_on(CALLER_ADDR0, ip); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (unlikely(current->hardirqs_enabled)) { - /* - * Neither irq nor preemption are disabled here - * so this is racy by nature but losing one hit - * in a stat is not a big deal. - */ - __debug_atomic_inc(redundant_hardirqs_on); - return; - } - - /* - * We're enabling irqs and according to our state above irqs weren't - * already enabled, yet we find the hardware thinks they are in fact - * enabled.. someone messed up their IRQ state tracing. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - /* - * See the fine text that goes along with this variable definition. - */ - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) - return; - - /* - * Can't allow enabling interrupts while in an interrupt handler, - * that's general bad form and such. Recursion, limited stack etc.. - */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) - return; - - current->lockdep_recursion = 1; - __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_on(void) -{ - trace_hardirqs_on_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -/* - * Hardirqs were disabled: - */ -void trace_hardirqs_off_caller(unsigned long ip) -{ - struct task_struct *curr = current; - - time_hardirqs_off(CALLER_ADDR0, ip); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - /* - * So we're supposed to get called after you mask local IRQs, but for - * some reason the hardware doesn't quite think you did a proper job. 
- */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->hardirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_off_events); - } else - debug_atomic_inc(redundant_hardirqs_off); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -/* - * Softirqs will be enabled: - */ -void trace_softirqs_on(unsigned long ip) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - /* - * We fancy IRQs being disabled here, see softirq.c, avoids - * funny state and nesting things. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - debug_atomic_inc(redundant_softirqs_on); - return; - } - - current->lockdep_recursion = 1; - /* - * We'll do an OFF -> ON transition: - */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(softirqs_on_events); - /* - * We are going to turn softirqs on, so set the - * usage bit for all held locks, if hardirqs are - * enabled too: - */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); - current->lockdep_recursion = 0; -} - -/* - * Softirqs were disabled: - */ -void trace_softirqs_off(unsigned long ip) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - /* - * We fancy IRQs being disabled here, see softirq.c - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(softirqs_off_events); - /* - * 
Whoops, we wanted softirqs off, so why aren't they? - */ - DEBUG_LOCKS_WARN_ON(!softirq_count()); - } else - debug_atomic_inc(redundant_softirqs_off); -} - -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. - */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) -{ - /* - * If non-trylock use in a hardirq or softirq context, then - * mark the lock as used in these contexts: - */ - if (!hlock->trylock) { - if (hlock->read) { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - } else { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) - return 0; - } - } - if (!hlock->hardirqs_off) { - if (hlock->read) { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQ_READ)) - return 0; - if (curr->softirqs_enabled) - if 
(!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQ_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQ)) - return 0; - } - } - - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - - return 1; -} - -static int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - unsigned int depth = curr->lockdep_depth; - - /* - * Keep track of points where we cross into an interrupt context: - */ - hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + - curr->softirq_context; - if (depth) { - struct held_lock *prev_hlock; - - prev_hlock = curr->held_locks + depth-1; - /* - * If we cross into another context, reset the - * hash key (this also prevents the checking and the - * adding of the dependency to 'prev'): - */ - if (prev_hlock->irq_context != hlock->irq_context) - return 1; - } - return 0; -} - -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? 
when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) -{ - return 1; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -/* - * Mark a lock with a usage bit, and validate the state transition: - */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - unsigned int new_mask = 1 << new_bit, ret = 1; - - /* - * If already set then do not dirty the cacheline, - * nor do any checks: - */ - if (likely(hlock_class(this)->usage_mask & new_mask)) - return 1; - - if (!graph_lock()) - return 0; - /* - * Make sure we didn't race: - */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } - - hlock_class(this)->usage_mask |= new_mask; - - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) - return 0; - - switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - default: - if (!debug_locks_off_graph_unlock()) - return 0; - WARN_ON(1); - return 0; - } - - graph_unlock(); - - /* - * We must printk outside of the graph_lock: - */ - if (ret == 2) { - printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); - print_lock(this); - print_irqtrace_events(curr); - dump_stack(); - } - - return ret; -} - -/* - * Initialize a lock instance's lock-class mapping info: - */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - int 
i; - - kmemcheck_mark_initialized(lock, sizeof(*lock)); - - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) - lock->class_cache[i] = NULL; - -#ifdef CONFIG_LOCK_STAT - lock->cpu = raw_smp_processor_id(); -#endif - - /* - * Can't be having no nameless bastards around this place! - */ - if (DEBUG_LOCKS_WARN_ON(!name)) { - lock->name = "NULL"; - return; - } - - lock->name = name; - - /* - * No key, no joy, we need to hash something. - */ - if (DEBUG_LOCKS_WARN_ON(!key)) - return; - /* - * Sanity check, the lock-class key must be persistent: - */ - if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. - */ - DEBUG_LOCKS_WARN_ON(1); - return; - } - lock->key = key; - - if (unlikely(!debug_locks)) - return; - - if (subclass) - register_lock_class(lock, subclass, 1); -} -EXPORT_SYMBOL_GPL(lockdep_init_map); - -struct lock_class_key __lockdep_no_validate__; - -/* - * This gets called for every mutex_lock*()/spin_lock*() operation. - * We maintain the dependency maps and validate the locking attempt: - */ -static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, int hardirqs_off, - struct lockdep_map *nest_lock, unsigned long ip, - int references) -{ - struct task_struct *curr = current; - struct lock_class *class = NULL; - struct held_lock *hlock; - unsigned int depth, id; - int chain_head = 0; - int class_idx; - u64 chain_key; - - if (!prove_locking) - check = 1; - - if (unlikely(!debug_locks)) - return 0; - - /* - * Lockdep should run with IRQs disabled, otherwise we could - * get an interrupt which would want to take locks, which would - * end up in lockdep and have you got a head-ache already? - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (lock->key == &__lockdep_no_validate__) - check = 1; - - if (subclass < NR_LOCKDEP_CACHING_CLASSES) - class = lock->class_cache[subclass]; - /* - * Not cached? 
- */ - if (unlikely(!class)) { - class = register_lock_class(lock, subclass, 0); - if (!class) - return 0; - } - atomic_inc((atomic_t *)&class->ops); - if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - } - - /* - * Add the lock to the list of currently held locks. - * (we dont increase the depth just yet, up until the - * dependency checks are done) - */ - depth = curr->lockdep_depth; - /* - * Ran out of static storage for our per-task lock stack again have we? - */ - if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) - return 0; - - class_idx = class - lock_classes + 1; - - if (depth) { - hlock = curr->held_locks + depth - 1; - if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) - hlock->references++; - else - hlock->references = 2; - - return 1; - } - } - - hlock = curr->held_locks + depth; - /* - * Plain impossible, we just registered it and checked it weren't no - * NULL like.. I bet this mushroom I ate was good! - */ - if (DEBUG_LOCKS_WARN_ON(!class)) - return 0; - hlock->class_idx = class_idx; - hlock->acquire_ip = ip; - hlock->instance = lock; - hlock->nest_lock = nest_lock; - hlock->trylock = trylock; - hlock->read = read; - hlock->check = check; - hlock->hardirqs_off = !!hardirqs_off; - hlock->references = references; -#ifdef CONFIG_LOCK_STAT - hlock->waittime_stamp = 0; - hlock->holdtime_stamp = lockstat_clock(); -#endif - - if (check == 2 && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) - return 0; - - /* - * Calculate the chain hash: it's the combined hash of all the - * lock keys along the dependency chain. We save the hash value - * at every step so that we can get the current hash easily - * after unlock. The chain hash is then used to cache dependency - * results. 
- * - * The 'key ID' is what is the most compact key value to drive - * the hash, not class->key. - */ - id = class - lock_classes; - /* - * Whoops, we did it again.. ran straight out of our static allocation. - */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - return 0; - - chain_key = curr->curr_chain_key; - if (!depth) { - /* - * How can we have a chain hash when we ain't got no keys?! - */ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) - return 0; - chain_head = 1; - } - - hlock->prev_chain_key = chain_key; - if (separate_irq_context(curr, hlock)) { - chain_key = 0; - chain_head = 1; - } - chain_key = iterate_chain_key(chain_key, id); - - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) - return 0; - - curr->curr_chain_key = chain_key; - curr->lockdep_depth++; - check_chain_key(curr); -#ifdef CONFIG_DEBUG_LOCKDEP - if (unlikely(!debug_locks)) - return 0; -#endif - if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { - debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - - if (unlikely(curr->lockdep_depth > max_lockdep_depth)) - max_lockdep_depth = curr->lockdep_depth; - - return 1; -} - -static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! 
]\n"); - print_kernel_ident(); - printk("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", - curr->comm, task_pid_nr(curr)); - print_lockdep_cache(lock); - printk(") at:\n"); - print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Common debugging checks for both nested and non-nested unlock: - */ -static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (unlikely(!debug_locks)) - return 0; - /* - * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); - - return 1; -} - -static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) -{ - if (hlock->instance == lock) - return 1; - - if (hlock->references) { - struct lock_class *class = lock->class_cache[0]; - - if (!class) - class = look_up_lock_class(lock, 0); - - /* - * If look_up_lock_class() failed to find a class, we're trying - * to test if we hold a lock that has never yet been acquired. - * Clearly if the lock hasn't been acquired _ever_, we're not - * holding it either, so report failure. - */ - if (!class) - return 0; - - /* - * References, but not a lock we're actually ref-counting? - * State got messed up, follow the sites that change ->references - * and try to make sense of it. 
- */ - if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) - return 0; - - if (hlock->class_idx == class - lock_classes + 1) - return 1; - } - - return 0; -} - -static int -__lock_set_class(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, unsigned int subclass, - unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class *class; - unsigned int depth; - int i; - - depth = curr->lockdep_depth; - /* - * This function is about (re)setting the class of a held lock, - * yet we're not actually holding any locks. Naughty user! - */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_inbalance_bug(curr, lock, ip); - -found_it: - lockdep_init_map(lock, name, key, 0); - class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; - - curr->lockdep_depth = i; - curr->curr_chain_key = hlock->prev_chain_key; - - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references)) - return 0; - } - - /* - * I took it apart and put it back together again, except now I have - * these 'spare' parts.. where shall I put them. - */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) - return 0; - return 1; -} - -/* - * Remove the lock to the list of currently held locks in a - * potentially non-nested (out of order) manner. 
This is a - * relatively rare operation, as all the unlock APIs default - * to nested mode (which uses lock_release()): - */ -static int -lock_release_non_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) -{ - struct held_lock *hlock, *prev_hlock; - unsigned int depth; - int i; - - /* - * Check whether the lock exists in the current stack - * of held locks: - */ - depth = curr->lockdep_depth; - /* - * So we're all set to release this lock.. wait what lock? We don't - * own any locks, you've been drinking again? - */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_inbalance_bug(curr, lock, ip); - -found_it: - if (hlock->instance == lock) - lock_release_holdtime(hlock); - - if (hlock->references) { - hlock->references--; - if (hlock->references) { - /* - * We had, and after removing one, still have - * references, the current lock stack is still - * valid. We're done! - */ - return 1; - } - } - - /* - * We have the right lock to unlock, 'hlock' points to it. - * Now we remove it from the stack, and add back the other - * entries (if any), recalculating the hash along the way: - */ - - curr->lockdep_depth = i; - curr->curr_chain_key = hlock->prev_chain_key; - - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references)) - return 0; - } - - /* - * We had N bottles of beer on the wall, we drank one, but now - * there's not N-1 bottles of beer left on the wall... 
- */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) - return 0; - return 1; -} - -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. the current top of the lock-stack is unlocked) - */ -static int lock_release_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) -{ - struct held_lock *hlock; - unsigned int depth; - - /* - * Pop off the top of the lock stack: - */ - depth = curr->lockdep_depth - 1; - hlock = curr->held_locks + depth; - - /* - * Is the unlock non-nested: - */ - if (hlock->instance != lock || hlock->references) - return lock_release_non_nested(curr, lock, ip); - curr->lockdep_depth--; - - /* - * No more locks, but somehow we've got hash left over, who left it? - */ - if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) - return 0; - - curr->curr_chain_key = hlock->prev_chain_key; - - lock_release_holdtime(hlock); - -#ifdef CONFIG_DEBUG_LOCKDEP - hlock->prev_chain_key = 0; - hlock->class_idx = 0; - hlock->acquire_ip = 0; - hlock->irq_context = 0; -#endif - return 1; -} - -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. 
the current top of the lock-stack is unlocked) - */ -static void -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) -{ - struct task_struct *curr = current; - - if (!check_unlock(curr, lock, ip)) - return; - - if (nested) { - if (!lock_release_nested(curr, lock, ip)) - return; - } else { - if (!lock_release_non_nested(curr, lock, ip)) - return; - } - - check_chain_key(curr); -} - -static int __lock_is_held(struct lockdep_map *lock) -{ - struct task_struct *curr = current; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - struct held_lock *hlock = curr->held_locks + i; - - if (match_held_lock(hlock, lock)) - return 1; - } - - return 0; -} - -/* - * Check whether we follow the irq-flags state precisely: - */ -static void check_flags(unsigned long flags) -{ -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) - if (!debug_locks) - return; - - if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { - printk("possible reason: unannotated irqs-off.\n"); - } - } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { - printk("possible reason: unannotated irqs-on.\n"); - } - } - - /* - * We dont accurately track softirq state in e.g. - * hardirq contexts (such as on 4KSTACKS), so only - * check if not in hardirq contexts: - */ - if (!hardirq_count()) { - if (softirq_count()) { - /* like the above, but with softirqs */ - DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); - } else { - /* lick the above, does it taste good? 
*/ - DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); - } - } - - if (!debug_locks) - print_irqtrace_events(current); -#endif -} - -void lock_set_class(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, unsigned int subclass, - unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - current->lockdep_recursion = 1; - check_flags(flags); - if (__lock_set_class(lock, name, key, subclass, ip)) - check_chain_key(current); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_set_class); - -/* - * We are not always called with irqs disabled - do that here, - * and also avoid lockdep recursion: - */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *nest_lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - - current->lockdep_recursion = 1; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_acquire); - -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_release(lock, ip); - __lock_release(lock, nested, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_release); - -int lock_is_held(struct lockdep_map *lock) -{ - unsigned long flags; - int ret = 0; - - if (unlikely(current->lockdep_recursion)) - return 1; /* avoid false negative lockdep_assert_held() */ - - raw_local_irq_save(flags); - 
check_flags(flags); - - current->lockdep_recursion = 1; - ret = __lock_is_held(lock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); - - return ret; -} -EXPORT_SYMBOL_GPL(lock_is_held); - -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = gfp_mask; -} - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} - -#ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! ]\n"); - print_kernel_ident(); - printk("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", - curr->comm, task_pid_nr(curr)); - print_lockdep_cache(lock); - printk(") at:\n"); - print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static void -__lock_contended(struct lockdep_map *lock, unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class_stats *stats; - unsigned int depth; - int i, contention_point, contending_point; - - depth = curr->lockdep_depth; - /* - * Whee, we contended on this lock, except it seems we're not - * actually trying to acquire anything much at all.. 
- */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - print_lock_contention_bug(curr, lock, ip); - return; - -found_it: - if (hlock->instance != lock) - return; - - hlock->waittime_stamp = lockstat_clock(); - - contention_point = lock_point(hlock_class(hlock)->contention_point, ip); - contending_point = lock_point(hlock_class(hlock)->contending_point, - lock->ip); - - stats = get_lock_stats(hlock_class(hlock)); - if (contention_point < LOCKSTAT_POINTS) - stats->contention_point[contention_point]++; - if (contending_point < LOCKSTAT_POINTS) - stats->contending_point[contending_point]++; - if (lock->cpu != smp_processor_id()) - stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); -} - -static void -__lock_acquired(struct lockdep_map *lock, unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class_stats *stats; - unsigned int depth; - u64 now, waittime = 0; - int i, cpu; - - depth = curr->lockdep_depth; - /* - * Yay, we acquired ownership of this lock we didn't try to - * acquire, how the heck did that happen? 
- */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; - -found_it: - if (hlock->instance != lock) - return; - - cpu = smp_processor_id(); - if (hlock->waittime_stamp) { - now = lockstat_clock(); - waittime = now - hlock->waittime_stamp; - hlock->holdtime_stamp = now; - } - - trace_lock_acquired(lock, ip); - - stats = get_lock_stats(hlock_class(hlock)); - if (waittime) { - if (hlock->read) - lock_time_inc(&stats->read_waittime, waittime); - else - lock_time_inc(&stats->write_waittime, waittime); - } - if (lock->cpu != cpu) - stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); - - lock->cpu = cpu; - lock->ip = ip; -} - -void lock_contended(struct lockdep_map *lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(!lock_stat)) - return; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_contended(lock, ip); - __lock_contended(lock, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_contended); - -void lock_acquired(struct lockdep_map *lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(!lock_stat)) - return; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lock_acquired(lock, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_acquired); -#endif - -/* - * Used by the testsuite, sanitize the validator state - * after a simulated failure: - */ - -void lockdep_reset(void) -{ - 
unsigned long flags; - int i; - - raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; - memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); - nr_hardirq_chains = 0; - nr_softirq_chains = 0; - nr_process_chains = 0; - debug_locks = 1; - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - raw_local_irq_restore(flags); -} - -static void zap_class(struct lock_class *class) -{ - int i; - - /* - * Remove all dependencies this lock is - * involved in: - */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); - } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - list_del_rcu(&class->hash_entry); - list_del_rcu(&class->lock_entry); - - class->key = NULL; -} - -static inline int within(const void *addr, void *start, unsigned long size) -{ - return addr >= start && addr < start + size; -} - -void lockdep_free_key_range(void *start, unsigned long size) -{ - struct lock_class *class, *next; - struct list_head *head; - unsigned long flags; - int i; - int locked; - - raw_local_irq_save(flags); - locked = graph_lock(); - - /* - * Unhash all classes that were created by this module: - */ - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_safe(class, next, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); - } - } - - if (locked) - graph_unlock(); - raw_local_irq_restore(flags); -} - -void lockdep_reset_lock(struct lockdep_map *lock) -{ - struct lock_class *class, *next; - struct list_head *head; - unsigned long flags; - int i, j; - int locked; - - raw_local_irq_save(flags); - - /* - * Remove all classes this lock might have: - */ - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - /* - * If the class 
exists we look it up and zap it: - */ - class = look_up_lock_class(lock, j); - if (class) - zap_class(class); - } - /* - * Debug check: in the end all mapped classes should - * be gone. - */ - locked = graph_lock(); - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_safe(class, next, head, hash_entry) { - int match = 0; - - for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) - match |= class == lock->class_cache[j]; - - if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); - } - goto out_restore; - } - } - } - if (locked) - graph_unlock(); - -out_restore: - raw_local_irq_restore(flags); -} - -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - -void __init lockdep_info(void) -{ - printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... 
CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE -#ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) -#endif - ) / 1024 - ); - - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error! lock-%s was acquired" - "before lockdep_init\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif -} - -static void -print_freed_lock_bug(struct task_struct *curr, const void *mem_from, - const void *mem_to, struct held_lock *hlock) -{ - if (!debug_locks_off()) - return; - if (debug_locks_silent) - return; - - printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! 
]\n"); - print_kernel_ident(); - printk("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", - curr->comm, task_pid_nr(curr), mem_from, mem_to-1); - print_lock(hlock); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); -} - -static inline int not_in_range(const void* mem_from, unsigned long mem_len, - const void* lock_from, unsigned long lock_len) -{ - return lock_from + lock_len <= mem_from || - mem_from + mem_len <= lock_from; -} - -/* - * Called when kernel memory is freed (or unmapped), or if a lock - * is destroyed or reinitialized - this code checks whether there is - * any held lock in the memory range of to : - */ -void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) -{ - struct task_struct *curr = current; - struct held_lock *hlock; - unsigned long flags; - int i; - - if (unlikely(!debug_locks)) - return; - - local_irq_save(flags); - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - - if (not_in_range(mem_from, mem_len, hlock->instance, - sizeof(*hlock->instance))) - continue; - - print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); - break; - } - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); - -static void print_held_locks_bug(struct task_struct *curr) -{ - if (!debug_locks_off()) - return; - if (debug_locks_silent) - return; - - printk("\n"); - printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! 
]\n"); - print_kernel_ident(); - printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); -} - -void debug_check_no_locks_held(struct task_struct *task) -{ - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); -} - -void debug_show_all_locks(void) -{ - struct task_struct *g, *p; - int count = 10; - int unlock = 1; - - if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); - return; - } - printk("\nShowing all locks held in the system:\n"); - - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - printk(" #%d", 10-count); - mdelay(200); - goto retry; - } - printk(" ignoring it.\n"); - unlock = 0; - } else { - if (count != 10) - printk(KERN_CONT " locked it.\n"); - } - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) - continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - printk("\n"); - printk("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); -} -EXPORT_SYMBOL_GPL(debug_show_all_locks); - -/* - * Careful: only use this function if you are sure that - * the task cannot run in parallel! 
- */ -void debug_show_held_locks(struct task_struct *task) -{ - if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); - return; - } - lockdep_print_held_locks(task); -} -EXPORT_SYMBOL_GPL(debug_show_held_locks); - -void lockdep_sys_exit(void) -{ - struct task_struct *curr = current; - - if (unlikely(curr->lockdep_depth)) { - if (!debug_locks_off()) - return; - printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); - print_kernel_ident(); - printk("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", - curr->comm, curr->pid); - lockdep_print_held_locks(curr); - } -} - -void lockdep_rcu_suspicious(const char *file, const int line, const char *s) -{ - struct task_struct *curr = current; - -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ - /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); - printk("===============================\n"); - printk("[ INFO: suspicious RCU usage. ]\n"); - print_kernel_ident(); - printk("-------------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); - - /* - * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU - * considers that CPU to be in an "extended quiescent state", - * which means that RCU will be completely ignoring that CPU. - * Therefore, rcu_read_lock() and friends have absolutely no - * effect on a CPU running in that state. In other words, even if - * such an RCU-idle CPU has called rcu_read_lock(), RCU might well - * delete data structures out from under it. 
RCU really has no - * choice here: we need to keep an RCU-free window in idle where - * the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run - * in the idle task. - * - * So complain bitterly if someone does call rcu_read_lock(), - * rcu_read_lock_bh() and so on from extended quiescent states. - */ - if (rcu_is_cpu_idle()) - printk("RCU used illegally from extended quiescent state!\n"); - - lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); - dump_stack(); -} -EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); -/* - * kernel/lockdep_proc.c - * - * Runtime locking correctness validator - * - * Started by Ingo Molnar: - * - * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Code for /proc/lockdep and /proc/lockdep_stats: - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lockdep_internals.h" - -static void *l_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &all_lock_classes, pos); -} - -static void *l_start(struct seq_file *m, loff_t *pos) -{ - return seq_list_start_head(&all_lock_classes, *pos); -} - -static void l_stop(struct seq_file *m, void *v) -{ -} - -static void print_name(struct seq_file *m, struct lock_class *class) -{ - char str[128]; - const char *name = class->name; - - if (!name) { - name = __get_key_name(class->key, str); - seq_printf(m, "%s", name); - } else{ - seq_printf(m, "%s", name); - if (class->name_version > 1) - seq_printf(m, "#%d", class->name_version); - if (class->subclass) - seq_printf(m, "/%d", class->subclass); - } -} - -static int l_show(struct seq_file *m, void *v) -{ - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); - struct lock_list *entry; - char usage[LOCK_USAGE_CHARS]; - - if (v == 
&all_lock_classes) { - seq_printf(m, "all lock classes:\n"); - return 0; - } - - seq_printf(m, "%p", class->key); -#ifdef CONFIG_DEBUG_LOCKDEP - seq_printf(m, " OPS:%8ld", class->ops); -#endif -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); - seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); -#endif - - get_usage_chars(class, usage); - seq_printf(m, " %s", usage); - - seq_printf(m, ": "); - print_name(m, class); - seq_puts(m, "\n"); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (entry->distance == 1) { - seq_printf(m, " -> [%p] ", entry->class->key); - print_name(m, entry->class); - seq_puts(m, "\n"); - } - } - seq_puts(m, "\n"); - - return 0; -} - -static const struct seq_operations lockdep_ops = { - .start = l_start, - .next = l_next, - .stop = l_stop, - .show = l_show, -}; - -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_PROVE_LOCKING -static void *lc_start(struct seq_file *m, loff_t *pos) -{ - if (*pos == 0) - return SEQ_START_TOKEN; - - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; -} - -static void *lc_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return lc_start(m, pos); -} - -static void lc_stop(struct seq_file *m, void *v) -{ -} - -static int lc_show(struct seq_file *m, void *v) -{ - struct lock_chain *chain = v; - struct lock_class *class; - int i; - - if (v == SEQ_START_TOKEN) { - seq_printf(m, "all lock chains:\n"); - return 0; - } - - seq_printf(m, "irq_context: %d\n", chain->irq_context); - - for (i = 0; i < chain->depth; i++) { - class = lock_chain_get_class(chain, i); - if (!class->key) - continue; - - seq_printf(m, "[%p] ", class->key); - print_name(m, class); - seq_puts(m, 
"\n"); - } - seq_puts(m, "\n"); - - return 0; -} - -static const struct seq_operations lockdep_chains_ops = { - .start = lc_start, - .next = lc_next, - .stop = lc_stop, - .show = lc_show, -}; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif /* CONFIG_PROVE_LOCKING */ - -static void lockdep_stats_debug_show(struct seq_file *m) -{ -#ifdef CONFIG_DEBUG_LOCKDEP - unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), - hi2 = debug_atomic_read(hardirqs_off_events), - hr1 = debug_atomic_read(redundant_hardirqs_on), - hr2 = debug_atomic_read(redundant_hardirqs_off), - si1 = debug_atomic_read(softirqs_on_events), - si2 = debug_atomic_read(softirqs_off_events), - sr1 = debug_atomic_read(redundant_softirqs_on), - sr2 = debug_atomic_read(redundant_softirqs_off); - - seq_printf(m, " chain lookup misses: %11llu\n", - debug_atomic_read(chain_lookup_misses)); - seq_printf(m, " chain lookup hits: %11llu\n", - debug_atomic_read(chain_lookup_hits)); - seq_printf(m, " cyclic checks: %11llu\n", - debug_atomic_read(nr_cyclic_checks)); - seq_printf(m, " find-mask forwards checks: %11llu\n", - debug_atomic_read(nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask backwards checks: %11llu\n", - debug_atomic_read(nr_find_usage_backwards_checks)); - - seq_printf(m, " hardirq on events: %11llu\n", hi1); - seq_printf(m, " hardirq off events: %11llu\n", hi2); - seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); - seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); - seq_printf(m, " softirq on events: %11llu\n", si1); - seq_printf(m, " softirq off events: %11llu\n", si2); - seq_printf(m, " redundant softirq ons: %11llu\n", sr1); - seq_printf(m, " redundant softirq offs: %11llu\n", sr2); -#endif -} - -static 
int lockdep_stats_show(struct seq_file *m, void *v) -{ - struct lock_class *class; - unsigned long nr_unused = 0, nr_uncategorized = 0, - nr_irq_safe = 0, nr_irq_unsafe = 0, - nr_softirq_safe = 0, nr_softirq_unsafe = 0, - nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, - nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, - nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, - nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - - if (class->usage_mask == 0) - nr_unused++; - if (class->usage_mask == LOCKF_USED) - nr_uncategorized++; - if (class->usage_mask & LOCKF_USED_IN_IRQ) - nr_irq_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQ) - nr_irq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) - nr_softirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) - nr_softirq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) - nr_hardirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) - nr_hardirq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) - nr_irq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) - nr_irq_read_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) - nr_softirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) - nr_softirq_read_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) - nr_hardirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) - nr_hardirq_read_unsafe++; - -#ifdef CONFIG_PROVE_LOCKING - sum_forward_deps += lockdep_count_forward_deps(class); -#endif - } -#ifdef CONFIG_DEBUG_LOCKDEP - DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); -#endif - seq_printf(m, " lock-classes: %11lu [max: %lu]\n", - nr_lock_classes, MAX_LOCKDEP_KEYS); - seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", - nr_list_entries, MAX_LOCKDEP_ENTRIES); - seq_printf(m, " indirect dependencies: %11lu\n", - sum_forward_deps); - - /* - * Total number 
of dependencies: - * - * All irq-safe locks may nest inside irq-unsafe locks, - * plus all the other known dependencies: - */ - seq_printf(m, " all direct dependencies: %11lu\n", - nr_irq_unsafe * nr_irq_safe + - nr_hardirq_unsafe * nr_hardirq_safe + - nr_list_entries); - -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - seq_printf(m, " in-hardirq chains: %11u\n", - nr_hardirq_chains); - seq_printf(m, " in-softirq chains: %11u\n", - nr_softirq_chains); -#endif - seq_printf(m, " in-process chains: %11u\n", - nr_process_chains); - seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", - nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); - seq_printf(m, " combined max dependencies: %11u\n", - (nr_hardirq_chains + 1) * - (nr_softirq_chains + 1) * - (nr_process_chains + 1) - ); - seq_printf(m, " hardirq-safe locks: %11lu\n", - nr_hardirq_safe); - seq_printf(m, " hardirq-unsafe locks: %11lu\n", - nr_hardirq_unsafe); - seq_printf(m, " softirq-safe locks: %11lu\n", - nr_softirq_safe); - seq_printf(m, " softirq-unsafe locks: %11lu\n", - nr_softirq_unsafe); - seq_printf(m, " irq-safe locks: %11lu\n", - nr_irq_safe); - seq_printf(m, " irq-unsafe locks: %11lu\n", - nr_irq_unsafe); - - seq_printf(m, " hardirq-read-safe locks: %11lu\n", - nr_hardirq_read_safe); - seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", - nr_hardirq_read_unsafe); - seq_printf(m, " softirq-read-safe locks: %11lu\n", - nr_softirq_read_safe); - seq_printf(m, " softirq-read-unsafe locks: %11lu\n", - nr_softirq_read_unsafe); - seq_printf(m, " irq-read-safe locks: %11lu\n", - nr_irq_read_safe); - seq_printf(m, " irq-read-unsafe locks: %11lu\n", - nr_irq_read_unsafe); - - seq_printf(m, " uncategorized locks: %11lu\n", - nr_uncategorized); - seq_printf(m, " unused locks: %11lu\n", - 
nr_unused); - seq_printf(m, " max locking depth: %11u\n", - max_lockdep_depth); -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " max bfs queue depth: %11u\n", - max_bfs_queue_depth); -#endif - lockdep_stats_debug_show(m); - seq_printf(m, " debug_locks: %11u\n", - debug_locks); - - return 0; -} - -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#ifdef CONFIG_LOCK_STAT - -struct lock_stat_data { - struct lock_class *class; - struct lock_class_stats stats; -}; - -struct lock_stat_seq { - struct lock_stat_data *iter_end; - struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; -}; - -/* - * sort on absolute number of contentions - */ -static int lock_stat_cmp(const void *l, const void *r) -{ - const struct lock_stat_data *dl = l, *dr = r; - unsigned long nl, nr; - - nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; - nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; - - return nr - nl; -} - -static void seq_line(struct seq_file *m, char c, int offset, int length) -{ - int i; - - for (i = 0; i < offset; i++) - seq_puts(m, " "); - for (i = 0; i < length; i++) - seq_printf(m, "%c", c); - seq_puts(m, "\n"); -} - -static void snprint_time(char *buf, size_t bufsiz, s64 nr) -{ - s64 div; - s32 rem; - - nr += 5; /* for display rounding */ - div = div_s64_rem(nr, 1000, &rem); - snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); -} - -static void seq_time(struct seq_file *m, s64 time) -{ - char num[15]; - - snprint_time(num, sizeof(num), time); - seq_printf(m, " %14s", num); -} - -static void seq_lock_time(struct seq_file *m, struct lock_time *lt) -{ - seq_printf(m, "%14lu", lt->nr); - seq_time(m, lt->min); - seq_time(m, lt->max); - seq_time(m, lt->total); -} - -static void seq_stats(struct 
seq_file *m, struct lock_stat_data *data) -{ - char name[39]; - struct lock_class *class; - struct lock_class_stats *stats; - int i, namelen; - - class = data->class; - stats = &data->stats; - - namelen = 38; - if (class->name_version > 1) - namelen -= 2; /* XXX truncates versions > 9 */ - if (class->subclass) - namelen -= 2; - - if (!class->name) { - char str[KSYM_NAME_LEN]; - const char *key_name; - - key_name = __get_key_name(class->key, str); - snprintf(name, namelen, "%s", key_name); - } else { - snprintf(name, namelen, "%s", class->name); - } - namelen = strlen(name); - if (class->name_version > 1) { - snprintf(name+namelen, 3, "#%d", class->name_version); - namelen += 2; - } - if (class->subclass) { - snprintf(name+namelen, 3, "/%d", class->subclass); - namelen += 2; - } - - if (stats->write_holdtime.nr) { - if (stats->read_holdtime.nr) - seq_printf(m, "%38s-W:", name); - else - seq_printf(m, "%40s:", name); - - seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); - seq_lock_time(m, &stats->write_waittime); - seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); - seq_lock_time(m, &stats->write_holdtime); - seq_puts(m, "\n"); - } - - if (stats->read_holdtime.nr) { - seq_printf(m, "%38s-R:", name); - seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); - seq_lock_time(m, &stats->read_waittime); - seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); - seq_lock_time(m, &stats->read_holdtime); - seq_puts(m, "\n"); - } - - if (stats->read_waittime.nr + stats->write_waittime.nr == 0) - return; - - if (stats->read_holdtime.nr) - namelen += 2; - - for (i = 0; i < LOCKSTAT_POINTS; i++) { - char ip[32]; - - if (class->contention_point[i] == 0) - break; - - if (!i) - seq_line(m, '-', 40-namelen, namelen); - - snprintf(ip, sizeof(ip), "[<%p>]", - (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %pS\n", - name, stats->contention_point[i], - ip, (void *)class->contention_point[i]); - } - for (i = 0; i < 
LOCKSTAT_POINTS; i++) { - char ip[32]; - - if (class->contending_point[i] == 0) - break; - - if (!i) - seq_line(m, '-', 40-namelen, namelen); - - snprintf(ip, sizeof(ip), "[<%p>]", - (void *)class->contending_point[i]); - seq_printf(m, "%40s %14lu %29s %pS\n", - name, stats->contending_point[i], - ip, (void *)class->contending_point[i]); - } - if (i) { - seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); - seq_puts(m, "\n"); - } -} - -static void seq_header(struct seq_file *m) -{ - seq_printf(m, "lock_stat version 0.3\n"); - - if (unlikely(!debug_locks)) - seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); - - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " - "%14s %14s\n", - "class name", - "con-bounces", - "contentions", - "waittime-min", - "waittime-max", - "waittime-total", - "acq-bounces", - "acquisitions", - "holdtime-min", - "holdtime-max", - "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); - seq_printf(m, "\n"); -} - -static void *ls_start(struct seq_file *m, loff_t *pos) -{ - struct lock_stat_seq *data = m->private; - struct lock_stat_data *iter; - - if (*pos == 0) - return SEQ_START_TOKEN; - - iter = data->stats + (*pos - 1); - if (iter >= data->iter_end) - iter = NULL; - - return iter; -} - -static void *ls_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return ls_start(m, pos); -} - -static void ls_stop(struct seq_file *m, void *v) -{ -} - -static int ls_show(struct seq_file *m, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_header(m); - else - seq_stats(m, v); - - return 0; -} - -static const struct seq_operations lockstat_ops = { - .start = ls_start, - .next = ls_next, - .stop = ls_stop, - .show = ls_show, -}; - -static int lock_stat_open(struct inode *inode, struct file *file) -{ - int res; - struct lock_class *class; - struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); - - if (!data) - 
return -ENOMEM; - - res = seq_open(file, &lockstat_ops); - if (!res) { - struct lock_stat_data *iter = data->stats; - struct seq_file *m = file->private_data; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - iter->class = class; - iter->stats = lock_stats(class); - iter++; - } - data->iter_end = iter; - - sort(data->stats, data->iter_end - data->stats, - sizeof(struct lock_stat_data), - lock_stat_cmp, NULL); - - m->private = data; - } else - vfree(data); - - return res; -} - -static ssize_t lock_stat_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct lock_class *class; - char c; - - if (count) { - if (get_user(c, buf)) - return -EFAULT; - - if (c != '0') - return count; - - list_for_each_entry(class, &all_lock_classes, lock_entry) - clear_lock_stats(class); - } - return count; -} - -static int lock_stat_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - - vfree(seq->private); - return seq_release(inode, file); -} - -static const struct file_operations proc_lock_stat_operations = { - .open = lock_stat_open, - .write = lock_stat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = lock_stat_release, -}; -#endif /* CONFIG_LOCK_STAT */ - -static int __init lockdep_proc_init(void) -{ - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); -#ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); -#endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - -#ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, - &proc_lock_stat_operations); -#endif - - return 0; -} - -__initcall(lockdep_proc_init); - -/* - Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 
- - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#ifndef ARCH_SHF_SMALL -#define ARCH_SHF_SMALL 0 -#endif - -/* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_DEBUG_SET_MODULE_RONX=y - */ -#ifdef CONFIG_DEBUG_SET_MODULE_RONX -# define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif - -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? 
\ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) - -/* - * Mutex protects: - * 1) List of modules (also safely readable with preempt_disable), - * 2) module_use links, - * 3) module_addr_min/module_addr_max. - * (delete uses stop_machine/add uses RCU list operations). */ -DEFINE_MUTEX(module_mutex); -EXPORT_SYMBOL_GPL(module_mutex); -static LIST_HEAD(modules); -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ - - -/* Block module loading/unloading? */ -int modules_disabled = 0; - -/* Waiting for a module to finish initializing? */ -static DECLARE_WAIT_QUEUE_HEAD(module_wq); - -static BLOCKING_NOTIFIER_HEAD(module_notify_list); - -/* Bounds of module allocation, for speeding __module_address. - * Protected by module_mutex. */ -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - -int register_module_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_register(&module_notify_list, nb); -} -EXPORT_SYMBOL(register_module_notifier); - -int unregister_module_notifier(struct notifier_block * nb) -{ - return blocking_notifier_chain_unregister(&module_notify_list, nb); -} -EXPORT_SYMBOL(unregister_module_notifier); - -struct load_info { - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs; - struct _ddebug *debug; - unsigned int num_debug; - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - -/* We require a truly strong try_module_get(): 0 means failure due to - ongoing or failed initialization etc. 
*/ -static inline int strong_try_module_get(struct module *mod) -{ - if (mod && mod->state == MODULE_STATE_COMING) - return -EBUSY; - if (try_module_get(mod)) - return 0; - else - return -ENOENT; -} - -static inline void add_taint_module(struct module *mod, unsigned flag) -{ - add_taint(flag); - mod->taints |= (1U << flag); -} - -/* - * A thread that wants to hold a reference to a module only while it - * is running can call this to safely exit. nfsd and lockd use this. - */ -void __module_put_and_exit(struct module *mod, long code) -{ - module_put(mod); - do_exit(code); -} -EXPORT_SYMBOL(__module_put_and_exit); - -/* Find a module section: 0 means not found. */ -static unsigned int find_sec(const struct load_info *info, const char *name) -{ - unsigned int i; - - for (i = 1; i < info->hdr->e_shnum; i++) { - Elf_Shdr *shdr = &info->sechdrs[i]; - /* Alloc bit cleared means "ignore it." */ - if ((shdr->sh_flags & SHF_ALLOC) - && strcmp(info->secstrings + shdr->sh_name, name) == 0) - return i; - } - return 0; -} - -/* Find a module section, or NULL. */ -static void *section_addr(const struct load_info *info, const char *name) -{ - /* Section 0 has sh_addr 0. */ - return (void *)info->sechdrs[find_sec(info, name)].sh_addr; -} - -/* Find a module section, or NULL. Fill in number of "objects" in section. */ -static void *section_objs(const struct load_info *info, - const char *name, - size_t object_size, - unsigned int *num) -{ - unsigned int sec = find_sec(info, name); - - /* Section 0 has sh_addr 0 and sh_size 0. 
*/ - *num = info->sechdrs[sec].sh_size / object_size; - return (void *)info->sechdrs[sec].sh_addr; -} - -/* Provided by the linker */ -extern const struct kernel_symbol __start___ksymtab[]; -extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const unsigned long __start___kcrctab[]; -extern const unsigned long __start___kcrctab_gpl[]; -extern const unsigned long __start___kcrctab_gpl_future[]; -#ifdef CONFIG_UNUSED_SYMBOLS -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; -extern const unsigned long __start___kcrctab_unused[]; -extern const unsigned long __start___kcrctab_unused_gpl[]; -#endif - -#ifndef CONFIG_MODVERSIONS -#define symversion(base, idx) NULL -#else -#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) -#endif - -static bool each_symbol_in_section(const struct symsearch *arr, - unsigned int arrsize, - struct module *owner, - bool (*fn)(const struct symsearch *syms, - struct module *owner, - void *data), - void *data) -{ - unsigned int j; - - for (j = 0; j < arrsize; j++) { - if (fn(&arr[j], owner, data)) - return true; - } - - return false; -} - -/* Returns true as soon as fn returns true, otherwise false. 
*/ -bool each_symbol_section(bool (*fn)(const struct symsearch *arr, - struct module *owner, - void *data), - void *data) -{ - struct module *mod; - static const struct symsearch arr[] = { - { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - NOT_GPL_ONLY, false }, - { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, - GPL_ONLY, false }, - { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, - WILL_BE_GPL_ONLY, false }, -#ifdef CONFIG_UNUSED_SYMBOLS - { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, - NOT_GPL_ONLY, true }, - { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, - GPL_ONLY, true }, -#endif - }; - - if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) - return true; - - list_for_each_entry_rcu(mod, &modules, list) { - struct symsearch arr[] = { - { mod->syms, mod->syms + mod->num_syms, mod->crcs, - NOT_GPL_ONLY, false }, - { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, - GPL_ONLY, false }, - { mod->gpl_future_syms, - mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, - WILL_BE_GPL_ONLY, false }, -#ifdef CONFIG_UNUSED_SYMBOLS - { mod->unused_syms, - mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, - NOT_GPL_ONLY, true }, - { mod->unused_gpl_syms, - mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, - GPL_ONLY, true }, -#endif - }; - - if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) - return true; - } - return false; -} -EXPORT_SYMBOL_GPL(each_symbol_section); - -struct find_symbol_arg { - /* Input */ - const char *name; - bool gplok; - bool warn; - - /* Output */ - struct module *owner; - const unsigned long *crc; - const struct kernel_symbol *sym; -}; - -static bool check_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) -{ - struct find_symbol_arg *fsa = data; 
- - if (!fsa->gplok) { - if (syms->licence == GPL_ONLY) - return false; - if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); - } - } - -#ifdef CONFIG_UNUSED_SYMBOLS - if (syms->unused && fsa->warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", fsa->name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); - } -#endif - - fsa->owner = owner; - fsa->crc = symversion(syms->crcs, symnum); - fsa->sym = &syms->start[symnum]; - return true; -} - -static int cmp_name(const void *va, const void *vb) -{ - const char *a; - const struct kernel_symbol *b; - a = va; b = vb; - return strcmp(a, b->name); -} - -static bool find_symbol_in_section(const struct symsearch *syms, - struct module *owner, - void *data) -{ - struct find_symbol_arg *fsa = data; - struct kernel_symbol *sym; - - sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, - sizeof(struct kernel_symbol), cmp_name); - - if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) - return true; - - return false; -} - -/* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. 
*/ -const struct kernel_symbol *find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) -{ - struct find_symbol_arg fsa; - - fsa.name = name; - fsa.gplok = gplok; - fsa.warn = warn; - - if (each_symbol_section(find_symbol_in_section, &fsa)) { - if (owner) - *owner = fsa.owner; - if (crc) - *crc = fsa.crc; - return fsa.sym; - } - - pr_debug("Failed to find symbol %s\n", name); - return NULL; -} -EXPORT_SYMBOL_GPL(find_symbol); - -/* Search for module by name: must hold module_mutex. */ -struct module *find_module(const char *name) -{ - struct module *mod; - - list_for_each_entry(mod, &modules, list) { - if (strcmp(mod->name, name) == 0) - return mod; - } - return NULL; -} -EXPORT_SYMBOL_GPL(find_module); - -#ifdef CONFIG_SMP - -static inline void __percpu *mod_percpu(struct module *mod) -{ - return mod->percpu; -} - -static int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) -{ - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - mod->name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - mod->percpu = __alloc_reserved_percpu(size, align); - if (!mod->percpu) { - printk(KERN_WARNING - "%s: Could not allocate %lu bytes percpu data\n", - mod->name, size); - return -ENOMEM; - } - mod->percpu_size = size; - return 0; -} - -static void percpu_modfree(struct module *mod) -{ - free_percpu(mod->percpu); -} - -static unsigned int find_pcpusec(struct load_info *info) -{ - return find_sec(info, ".data..percpu"); -} - -static void percpu_modcopy(struct module *mod, - const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); -} - -/** - * is_module_percpu_address - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. 
- * - * RETURNS: - * %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) -{ - struct module *mod; - unsigned int cpu; - - preempt_disable(); - - list_for_each_entry_rcu(mod, &modules, list) { - if (!mod->percpu_size) - continue; - for_each_possible_cpu(cpu) { - void *start = per_cpu_ptr(mod->percpu, cpu); - - if ((void *)addr >= start && - (void *)addr < start + mod->percpu_size) { - preempt_enable(); - return true; - } - } - } - - preempt_enable(); - return false; -} - -#else /* ... !CONFIG_SMP */ - -static inline void __percpu *mod_percpu(struct module *mod) -{ - return NULL; -} -static inline int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) -{ - return -ENOMEM; -} -static inline void percpu_modfree(struct module *mod) -{ -} -static unsigned int find_pcpusec(struct load_info *info) -{ - return 0; -} -static inline void percpu_modcopy(struct module *mod, - const void *from, unsigned long size) -{ - /* pcpusec should be 0, and size of that section should be 0. 
*/ - BUG_ON(size != 0); -} -bool is_module_percpu_address(unsigned long addr) -{ - return false; -} - -#endif /* CONFIG_SMP */ - -#define MODINFO_ATTR(field) \ -static void setup_modinfo_##field(struct module *mod, const char *s) \ -{ \ - mod->field = kstrdup(s, GFP_KERNEL); \ -} \ -static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ - struct module_kobject *mk, char *buffer) \ -{ \ - return sprintf(buffer, "%s\n", mk->mod->field); \ -} \ -static int modinfo_##field##_exists(struct module *mod) \ -{ \ - return mod->field != NULL; \ -} \ -static void free_modinfo_##field(struct module *mod) \ -{ \ - kfree(mod->field); \ - mod->field = NULL; \ -} \ -static struct module_attribute modinfo_##field = { \ - .attr = { .name = __stringify(field), .mode = 0444 }, \ - .show = show_modinfo_##field, \ - .setup = setup_modinfo_##field, \ - .test = modinfo_##field##_exists, \ - .free = free_modinfo_##field, \ -}; - -MODINFO_ATTR(version); -MODINFO_ATTR(srcversion); - -static char last_unloaded_module[MODULE_NAME_LEN+1]; - -#ifdef CONFIG_MODULE_UNLOAD - -EXPORT_TRACEPOINT_SYMBOL(module_get); - -/* Init the unload section of the module. */ -static int module_unload_init(struct module *mod) -{ - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) - return -ENOMEM; - - INIT_LIST_HEAD(&mod->source_list); - INIT_LIST_HEAD(&mod->target_list); - - /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->incs, 1); - /* Backwards compatibility macros put refcount during init. */ - mod->waiter = current; - - return 0; -} - -/* Does a already use b? 
*/ -static int already_uses(struct module *a, struct module *b) -{ - struct module_use *use; - - list_for_each_entry(use, &b->source_list, source_list) { - if (use->source == a) { - pr_debug("%s uses %s!\n", a->name, b->name); - return 1; - } - } - pr_debug("%s does not use %s!\n", a->name, b->name); - return 0; -} - -/* - * Module a uses b - * - we add 'a' as a "source", 'b' as a "target" of module use - * - the module_use is added to the list of 'b' sources (so - * 'b' can walk the list to see who sourced them), and of 'a' - * targets (so 'a' can see what modules it targets). - */ -static int add_module_usage(struct module *a, struct module *b) -{ - struct module_use *use; - - pr_debug("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk(KERN_WARNING "%s: out of memory loading\n", a->name); - return -ENOMEM; - } - - use->source = a; - use->target = b; - list_add(&use->source_list, &b->source_list); - list_add(&use->target_list, &a->target_list); - return 0; -} - -/* Module a uses b: caller needs module_mutex() */ -int ref_module(struct module *a, struct module *b) -{ - int err; - - if (b == NULL || already_uses(a, b)) - return 0; - - /* If module isn't available, we fail. */ - err = strong_try_module_get(b); - if (err) - return err; - - err = add_module_usage(a, b); - if (err) { - module_put(b); - return err; - } - return 0; -} -EXPORT_SYMBOL_GPL(ref_module); - -/* Clear the unload stuff of the module. 
*/ -static void module_unload_free(struct module *mod) -{ - struct module_use *use, *tmp; - - mutex_lock(&module_mutex); - list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { - struct module *i = use->target; - pr_debug("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->source_list); - list_del(&use->target_list); - kfree(use); - } - mutex_unlock(&module_mutex); - - free_percpu(mod->refptr); -} - -#ifdef CONFIG_MODULE_FORCE_UNLOAD -static inline int try_force_unload(unsigned int flags) -{ - int ret = (flags & O_TRUNC); - if (ret) - add_taint(TAINT_FORCED_RMMOD); - return ret; -} -#else -static inline int try_force_unload(unsigned int flags) -{ - return 0; -} -#endif /* CONFIG_MODULE_FORCE_UNLOAD */ - -struct stopref -{ - struct module *mod; - int flags; - int *forced; -}; - -/* Whole machine is stopped with interrupts off when this runs. */ -static int __try_stop_module(void *_sref) -{ - struct stopref *sref = _sref; - - /* If it's not unused, quit unless we're forcing. */ - if (module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force_unload(sref->flags))) - return -EWOULDBLOCK; - } - - /* Mark it as dying. */ - sref->mod->state = MODULE_STATE_GOING; - return 0; -} - -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - if (flags & O_NONBLOCK) { - struct stopref sref = { mod, flags, forced }; - - return stop_machine(__try_stop_module, &sref, NULL); - } else { - /* We don't need to stop the machine for this. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - return 0; - } -} - -unsigned long module_refcount(struct module *mod) -{ - unsigned long incs = 0, decs = 0; - int cpu; - - for_each_possible_cpu(cpu) - decs += per_cpu_ptr(mod->refptr, cpu)->decs; - /* - * ensure the incs are added up after the decs. - * module_put ensures incs are visible before decs with smp_wmb. 
- * - * This 2-count scheme avoids the situation where the refcount - * for CPU0 is read, then CPU0 increments the module refcount, - * then CPU1 drops that refcount, then the refcount for CPU1 is - * read. We would record a decrement but not its corresponding - * increment so we would see a low count (disaster). - * - * Rare situation? But module_refcount can be preempted, and we - * might be tallying up 4096+ CPUs. So it is not impossible. - */ - smp_rmb(); - for_each_possible_cpu(cpu) - incs += per_cpu_ptr(mod->refptr, cpu)->incs; - return incs - decs; -} -EXPORT_SYMBOL(module_refcount); - -/* This exists whether we can unload or not */ -static void free_module(struct module *mod); - -static void wait_for_zero_refcount(struct module *mod) -{ - /* Since we might sleep for some time, release the mutex first */ - mutex_unlock(&module_mutex); - for (;;) { - pr_debug("Looking at refcount...\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - if (module_refcount(mod) == 0) - break; - schedule(); - } - current->state = TASK_RUNNING; - mutex_lock(&module_mutex); -} - -SYSCALL_DEFINE2(delete_module, const char __user *, name_user, - unsigned int, flags) -{ - struct module *mod; - char name[MODULE_NAME_LEN]; - int ret, forced = 0; - - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; - - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; - - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - - mod = find_module(name); - if (!mod) { - ret = -ENOENT; - goto out; - } - - if (!list_empty(&mod->source_list)) { - /* Other modules depend on us: get rid of them first. */ - ret = -EWOULDBLOCK; - goto out; - } - - /* Doing init or already dying? 
*/ - if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count and wake up - waiter --RR */ - pr_debug("%s already dying\n", mod->name); - ret = -EBUSY; - goto out; - } - - /* If it has an init func, it must have an exit func to unload */ - if (mod->init && !mod->exit) { - forced = try_force_unload(flags); - if (!forced) { - /* This module can't be removed */ - ret = -EBUSY; - goto out; - } - } - - /* Set this up before setting mod->state */ - mod->waiter = current; - - /* Stop the machine so refcounts can't move and disable module. */ - ret = try_stop_module(mod, flags, &forced); - if (ret != 0) - goto out; - - /* Never wait if forced. */ - if (!forced && module_refcount(mod) != 0) - wait_for_zero_refcount(mod); - - mutex_unlock(&module_mutex); - /* Final destruction now no one is using it. */ - if (mod->exit != NULL) - mod->exit(); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - async_synchronize_full(); - - /* Store the name of the last unloaded module for diagnostic purposes */ - strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - - free_module(mod); - return 0; -out: - mutex_unlock(&module_mutex); - return ret; -} - -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - struct module_use *use; - int printed_something = 0; - - seq_printf(m, " %lu ", module_refcount(mod)); - - /* Always include a trailing , so userspace can differentiate - between this and the old multi-field proc format. 
*/ - list_for_each_entry(use, &mod->source_list, source_list) { - printed_something = 1; - seq_printf(m, "%s,", use->source->name); - } - - if (mod->init != NULL && mod->exit == NULL) { - printed_something = 1; - seq_printf(m, "[permanent],"); - } - - if (!printed_something) - seq_printf(m, "-"); -} - -void __symbol_put(const char *symbol) -{ - struct module *owner; - - preempt_disable(); - if (!find_symbol(symbol, &owner, NULL, true, false)) - BUG(); - module_put(owner); - preempt_enable(); -} -EXPORT_SYMBOL(__symbol_put); - -/* Note this assumes addr is a function, which it currently always is. */ -void symbol_put_addr(void *addr) -{ - struct module *modaddr; - unsigned long a = (unsigned long)dereference_function_descriptor(addr); - - if (core_kernel_text(a)) - return; - - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ - modaddr = __module_text_address(a); - BUG_ON(!modaddr); - module_put(modaddr); -} -EXPORT_SYMBOL_GPL(symbol_put_addr); - -static ssize_t show_refcnt(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); -} - -static struct module_attribute modinfo_refcnt = - __ATTR(refcnt, 0444, show_refcnt, NULL); - -void module_put(struct module *module) -{ - if (module) { - preempt_disable(); - smp_wmb(); /* see comment in module_refcount */ - __this_cpu_inc(module->refptr->decs); - - trace_module_put(module, _RET_IP_); - /* Maybe they're waiting for us to drop reference? */ - if (unlikely(!module_is_live(module))) - wake_up_process(module->waiter); - preempt_enable(); - } -} -EXPORT_SYMBOL(module_put); - -#else /* !CONFIG_MODULE_UNLOAD */ -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - /* We don't know the usage count, or what modules are using. 
*/ - seq_printf(m, " - -"); -} - -static inline void module_unload_free(struct module *mod) -{ -} - -int ref_module(struct module *a, struct module *b) -{ - return strong_try_module_get(b); -} -EXPORT_SYMBOL_GPL(ref_module); - -static inline int module_unload_init(struct module *mod) -{ - return 0; -} -#endif /* CONFIG_MODULE_UNLOAD */ - -static size_t module_flags_taint(struct module *mod, char *buf) -{ - size_t l = 0; - - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - return l; -} - -static ssize_t show_initstate(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - const char *state = "unknown"; - - switch (mk->mod->state) { - case MODULE_STATE_LIVE: - state = "live"; - break; - case MODULE_STATE_COMING: - state = "coming"; - break; - case MODULE_STATE_GOING: - state = "going"; - break; - } - return sprintf(buffer, "%s\n", state); -} - -static struct module_attribute modinfo_initstate = - __ATTR(initstate, 0444, show_initstate, NULL); - -static ssize_t store_uevent(struct module_attribute *mattr, - struct module_kobject *mk, - const char *buffer, size_t count) -{ - enum kobject_action action; - - if (kobject_action_type(buffer, count, &action) == 0) - kobject_uevent(&mk->kobj, action); - return count; -} - -struct module_attribute module_uevent = - __ATTR(uevent, 0200, NULL, store_uevent); - -static ssize_t show_coresize(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - return sprintf(buffer, "%u\n", mk->mod->core_size); -} - -static struct module_attribute modinfo_coresize = - __ATTR(coresize, 0444, show_coresize, NULL); - -static ssize_t show_initsize(struct 
module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - return sprintf(buffer, "%u\n", mk->mod->init_size); -} - -static struct module_attribute modinfo_initsize = - __ATTR(initsize, 0444, show_initsize, NULL); - -static ssize_t show_taint(struct module_attribute *mattr, - struct module_kobject *mk, char *buffer) -{ - size_t l; - - l = module_flags_taint(mk->mod, buffer); - buffer[l++] = '\n'; - return l; -} - -static struct module_attribute modinfo_taint = - __ATTR(taint, 0444, show_taint, NULL); - -static struct module_attribute *modinfo_attrs[] = { - &module_uevent, - &modinfo_version, - &modinfo_srcversion, - &modinfo_initstate, - &modinfo_coresize, - &modinfo_initsize, - &modinfo_taint, -#ifdef CONFIG_MODULE_UNLOAD - &modinfo_refcnt, -#endif - NULL, -}; - -static const char vermagic[] = VERMAGIC_STRING; - -static int try_to_force_load(struct module *mod, const char *reason) -{ -#ifdef CONFIG_MODULE_FORCE_LOAD - if (!test_taint(TAINT_FORCED_MODULE)) - printk(KERN_WARNING "%s: %s: kernel tainted.\n", - mod->name, reason); - add_taint_module(mod, TAINT_FORCED_MODULE); - return 0; -#else - return -ENOEXEC; -#endif -} - -#ifdef CONFIG_MODVERSIONS -/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */ -static unsigned long maybe_relocated(unsigned long crc, - const struct module *crc_owner) -{ -#ifdef ARCH_RELOCATES_KCRCTAB - if (crc_owner == NULL) - return crc - (unsigned long)reloc_start; -#endif - return crc; -} - -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *symname, - struct module *mod, - const unsigned long *crc, - const struct module *crc_owner) -{ - unsigned int i, num_versions; - struct modversion_info *versions; - - /* Exporting module didn't supply crcs? OK, we're already tainted. */ - if (!crc) - return 1; - - /* No versions at all? modprobe --force does this. 
*/ - if (versindex == 0) - return try_to_force_load(mod, symname) == 0; - - versions = (void *) sechdrs[versindex].sh_addr; - num_versions = sechdrs[versindex].sh_size - / sizeof(struct modversion_info); - - for (i = 0; i < num_versions; i++) { - if (strcmp(versions[i].name, symname) != 0) - continue; - - if (versions[i].crc == maybe_relocated(*crc, crc_owner)) - return 1; - pr_debug("Found checksum %lX vs module %lX\n", - maybe_relocated(*crc, crc_owner), versions[i].crc); - goto bad_version; - } - - printk(KERN_WARNING "%s: no symbol version for %s\n", - mod->name, symname); - return 0; - -bad_version: - printk("%s: disagrees about version of symbol %s\n", - mod->name, symname); - return 0; -} - -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, - struct module *mod) -{ - const unsigned long *crc; - - /* Since this should be found in kernel (which can't be removed), - * no locking is necessary. */ - if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, - &crc, true, false)) - BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc, - NULL); -} - -/* First part is kernel version, which we ignore if module has crcs. */ -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - if (has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); - } - return strcmp(amagic, bmagic) == 0; -} -#else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *symname, - struct module *mod, - const unsigned long *crc, - const struct module *crc_owner) -{ - return 1; -} - -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, - struct module *mod) -{ - return 1; -} - -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - return strcmp(amagic, bmagic) == 0; -} -#endif /* CONFIG_MODVERSIONS */ - -/* Resolve a symbol for this module. I.e. 
if we find one, record usage. */ -static const struct kernel_symbol *resolve_symbol(struct module *mod, - const struct load_info *info, - const char *name, - char ownername[]) -{ - struct module *owner; - const struct kernel_symbol *sym; - const unsigned long *crc; - int err; - - mutex_lock(&module_mutex); - sym = find_symbol(name, &owner, &crc, - !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - if (!sym) - goto unlock; - - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, - owner)) { - sym = ERR_PTR(-EINVAL); - goto getname; - } - - err = ref_module(mod, owner); - if (err) { - sym = ERR_PTR(err); - goto getname; - } - -getname: - /* We must make copy under the lock if we failed to get ref. */ - strncpy(ownername, module_name(owner), MODULE_NAME_LEN); -unlock: - mutex_unlock(&module_mutex); - return sym; -} - -static const struct kernel_symbol * -resolve_symbol_wait(struct module *mod, - const struct load_info *info, - const char *name) -{ - const struct kernel_symbol *ksym; - char owner[MODULE_NAME_LEN]; - - if (wait_event_interruptible_timeout(module_wq, - !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) - || PTR_ERR(ksym) != -EBUSY, - 30 * HZ) <= 0) { - printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", - mod->name, owner); - } - return ksym; -} - -/* - * /sys/module/foo/sections stuff - * J. 
Corbet - */ -#ifdef CONFIG_SYSFS - -#ifdef CONFIG_KALLSYMS -static inline bool sect_empty(const Elf_Shdr *sect) -{ - return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; -} - -struct module_sect_attr -{ - struct module_attribute mattr; - char *name; - unsigned long address; -}; - -struct module_sect_attrs -{ - struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[0]; -}; - -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) -{ - struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%pK\n", (void *)sattr->address); -} - -static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -{ - unsigned int section; - - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); - kfree(sect_attrs); -} - -static void add_sect_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int nloaded = 0, i, size[2]; - struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct attribute **gattr; - - /* Count loaded sections and allocate structures */ - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i])) - nloaded++; - size[0] = ALIGN(sizeof(*sect_attrs) - + nloaded * sizeof(sect_attrs->attrs[0]), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); - if (sect_attrs == NULL) - return; - - /* Setup section attributes. 
*/ - sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; - - sect_attrs->nsections = 0; - sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < info->hdr->e_shnum; i++) { - Elf_Shdr *sec = &info->sechdrs[i]; - if (sect_empty(sec)) - continue; - sattr->address = sec->sh_addr; - sattr->name = kstrdup(info->secstrings + sec->sh_name, - GFP_KERNEL); - if (sattr->name == NULL) - goto out; - sect_attrs->nsections++; - sysfs_attr_init(&sattr->mattr.attr); - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUGO; - *(gattr++) = &(sattr++)->mattr.attr; - } - *gattr = NULL; - - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) - goto out; - - mod->sect_attrs = sect_attrs; - return; - out: - free_sect_attrs(sect_attrs); -} - -static void remove_sect_attrs(struct module *mod) -{ - if (mod->sect_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->sect_attrs->grp); - /* We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. */ - free_sect_attrs(mod->sect_attrs); - mod->sect_attrs = NULL; - } -} - -/* - * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. - */ - -struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[0]; -}; - -static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. 
- */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) -{ - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } - kfree(notes_attrs); -} - -static void add_notes_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int notes, loaded, i; - struct module_notes_attrs *notes_attrs; - struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; - - /* Count notes sections and allocate structures. */ - notes = 0; - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i]) && - (info->sechdrs[i].sh_type == SHT_NOTE)) - ++notes; - - if (notes == 0) - return; - - notes_attrs = kzalloc(sizeof(*notes_attrs) - + notes * sizeof(notes_attrs->attrs[0]), - GFP_KERNEL); - if (notes_attrs == NULL) - return; - - notes_attrs->notes = notes; - nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { - if (sect_empty(&info->sechdrs[i])) - continue; - if (info->sechdrs[i].sh_type == SHT_NOTE) { - sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; - nattr->attr.mode = S_IRUGO; - nattr->size = info->sechdrs[i].sh_size; - nattr->private = (void *) info->sechdrs[i].sh_addr; - nattr->read = module_notes_read; - ++nattr; - } - ++loaded; - } - - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) - goto out; - - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) - goto out; - - mod->notes_attrs = notes_attrs; - return; - - out: - free_notes_attrs(notes_attrs, i); -} - -static void remove_notes_attrs(struct module *mod) -{ - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); -} - -#else - -static inline void 
add_sect_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_sect_attrs(struct module *mod) -{ -} - -static inline void add_notes_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_notes_attrs(struct module *mod) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void add_usage_links(struct module *mod) -{ -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - int nowarn; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - nowarn = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - } - mutex_unlock(&module_mutex); -#endif -} - -static void del_usage_links(struct module *mod) -{ -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); - mutex_unlock(&module_mutex); -#endif -} - -static int module_add_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - struct module_attribute *temp_attr; - int error = 0; - int i; - - mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * - (ARRAY_SIZE(modinfo_attrs) + 1)), - GFP_KERNEL); - if (!mod->modinfo_attrs) - return -ENOMEM; - - temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { - if (!attr->test || - (attr->test && attr->test(mod))) { - memcpy(temp_attr, attr, sizeof(*temp_attr)); - sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); - ++temp_attr; - } - } - return error; -} - -static void module_remove_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { - /* pick a field to test for end of list */ - if (!attr->attr.name) - break; - sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); - if (attr->free) - attr->free(mod); - } - 
kfree(mod->modinfo_attrs); -} - -static int mod_sysfs_init(struct module *mod) -{ - int err; - struct kobject *kobj; - - if (!module_sysfs_initialized) { - printk(KERN_ERR "%s: module sysfs not initialized\n", - mod->name); - err = -EINVAL; - goto out; - } - - kobj = kset_find_obj(module_kset, mod->name); - if (kobj) { - printk(KERN_ERR "%s: module is already loaded\n", mod->name); - kobject_put(kobj); - err = -EINVAL; - goto out; - } - - mod->mkobj.mod = mod; - - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - mod->mkobj.kobj.kset = module_kset; - err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, - "%s", mod->name); - if (err) - kobject_put(&mod->mkobj.kobj); - - /* delay uevent until full sysfs population */ -out: - return err; -} - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - int err; - - err = mod_sysfs_init(mod); - if (err) - goto out; - - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); - if (!mod->holders_dir) { - err = -ENOMEM; - goto out_unreg; - } - - err = module_param_sysfs_setup(mod, kparam, num_params); - if (err) - goto out_unreg_holders; - - err = module_add_modinfo_attrs(mod); - if (err) - goto out_unreg_param; - - add_usage_links(mod); - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); - - kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); - return 0; - -out_unreg_param: - module_param_sysfs_remove(mod); -out_unreg_holders: - kobject_put(mod->holders_dir); -out_unreg: - kobject_put(&mod->mkobj.kobj); -out: - return err; -} - -static void mod_sysfs_fini(struct module *mod) -{ - remove_notes_attrs(mod); - remove_sect_attrs(mod); - kobject_put(&mod->mkobj.kobj); -} - -#else /* !CONFIG_SYSFS */ - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static void mod_sysfs_fini(struct module *mod) -{ -} - 
-static void module_remove_modinfo_attrs(struct module *mod) -{ -} - -static void del_usage_links(struct module *mod) -{ -} - -#endif /* CONFIG_SYSFS */ - -static void mod_sysfs_teardown(struct module *mod) -{ - del_usage_links(mod); - module_remove_modinfo_attrs(mod); - module_param_sysfs_remove(mod); - kobject_put(mod->mkobj.drivers_dir); - kobject_put(mod->holders_dir); - mod_sysfs_fini(mod); -} - -/* - * unlink the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __unlink_module(void *_mod) -{ - struct module *mod = _mod; - list_del(&mod->list); - module_bug_cleanup(mod); - return 0; -} - -#ifdef CONFIG_DEBUG_SET_MODULE_RONX -/* - * LKM RO/NX protection: protect module's text/ro-data - * from modification and any data from execution. - */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) -{ - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); - - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); -} - -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size) -{ - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; - - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); - - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. 
- */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } -} - -static void unset_module_core_ro_nx(struct module *mod) -{ - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); -} - -static void unset_module_init_ro_nx(struct module *mod) -{ - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); -} - -/* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_rw); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_rw); - } - } - mutex_unlock(&module_mutex); -} - -/* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry_rcu(mod, &modules, list) { - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_ro); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_ro); - } - } - mutex_unlock(&module_mutex); -} -#else -static inline void set_section_ro_nx(void *base, unsigned long 
text_size, unsigned long ro_size, unsigned long total_size) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } -#endif - -void __weak module_free(struct module *mod, void *module_region) -{ - vfree(module_region); -} - -void __weak module_arch_cleanup(struct module *mod) -{ -} - -/* Free a module, remove from lists, etc. */ -static void free_module(struct module *mod) -{ - trace_module_free(mod); - - /* Delete from various lists */ - mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); - mutex_unlock(&module_mutex); - mod_sysfs_teardown(mod); - - /* Remove dynamic debug info */ - ddebug_remove_module(mod->name); - - /* Arch-specific cleanup. */ - module_arch_cleanup(mod); - - /* Module unload stuff */ - module_unload_free(mod); - - /* Free any allocated parameters. */ - destroy_params(mod->kp, mod->num_kp); - - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - kfree(mod->args); - percpu_modfree(mod); - - /* Free lock-classes: */ - lockdep_free_key_range(mod->module_core, mod->core_size); - - /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); - -#ifdef CONFIG_MPU - update_protections(current->mm); -#endif -} - -void *__symbol_get(const char *symbol) -{ - struct module *owner; - const struct kernel_symbol *sym; - - preempt_disable(); - sym = find_symbol(symbol, &owner, NULL, true, true); - if (sym && strong_try_module_get(owner)) - sym = NULL; - preempt_enable(); - - return sym ? (void *)sym->value : NULL; -} -EXPORT_SYMBOL_GPL(__symbol_get); - -/* - * Ensure that an exported symbol [global namespace] does not already exist - * in the kernel or in some other module's exported symbol table. - * - * You must hold the module_mutex. 
- */ -static int verify_export_symbols(struct module *mod) -{ - unsigned int i; - struct module *owner; - const struct kernel_symbol *s; - struct { - const struct kernel_symbol *sym; - unsigned int num; - } arr[] = { - { mod->syms, mod->num_syms }, - { mod->gpl_syms, mod->num_gpl_syms }, - { mod->gpl_future_syms, mod->num_gpl_future_syms }, -#ifdef CONFIG_UNUSED_SYMBOLS - { mod->unused_syms, mod->num_unused_syms }, - { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, -#endif - }; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { - printk(KERN_ERR - "%s: exports duplicate symbol %s" - " (owned by %s)\n", - mod->name, s->name, module_name(owner)); - return -ENOEXEC; - } - } - } - return 0; -} - -/* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(struct module *mod, const struct load_info *info) -{ - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - Elf_Sym *sym = (void *)symsec->sh_addr; - unsigned long secbase; - unsigned int i; - int ret = 0; - const struct kernel_symbol *ksym; - - for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { - const char *name = info->strtab + sym[i].st_name; - - switch (sym[i].st_shndx) { - case SHN_COMMON: - /* We compiled with -fno-common. These are not - supposed to happen. */ - pr_debug("Common symbol: %s\n", name); - printk("%s: please compile with -fno-common\n", - mod->name); - ret = -ENOEXEC; - break; - - case SHN_ABS: - /* Don't need to do anything */ - pr_debug("Absolute symbol: 0x%08lx\n", - (long)sym[i].st_value); - break; - - case SHN_UNDEF: - ksym = resolve_symbol_wait(mod, info, name); - /* Ok if resolved. */ - if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = ksym->value; - break; - } - - /* Ok if weak. 
*/ - if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) - break; - - printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); - ret = PTR_ERR(ksym) ?: -ENOENT; - break; - - default: - /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == info->index.pcpu) - secbase = (unsigned long)mod_percpu(mod); - else - secbase = info->sechdrs[sym[i].st_shndx].sh_addr; - sym[i].st_value += secbase; - break; - } - } - - return ret; -} - -int __weak apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: REL relocation unsupported\n", me->name); - return -ENOEXEC; -} - -int __weak apply_relocate_add(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: RELA relocation unsupported\n", me->name); - return -ENOEXEC; -} - -static int apply_relocations(struct module *mod, const struct load_info *info) -{ - unsigned int i; - int err = 0; - - /* Now do relocations. */ - for (i = 1; i < info->hdr->e_shnum; i++) { - unsigned int infosec = info->sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (infosec >= info->hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) - continue; - - if (info->sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(info->sechdrs, info->strtab, - info->index.sym, i, mod); - else if (info->sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(info->sechdrs, info->strtab, - info->index.sym, i, mod); - if (err < 0) - break; - } - return err; -} - -/* Additional bytes needed by arch in front of individual sections */ -unsigned int __weak arch_mod_section_prepend(struct module *mod, - unsigned int section) -{ - /* default implementation just returns zero */ - return 0; -} - -/* Update size with this section: return offset. 
*/ -static long get_offset(struct module *mod, unsigned int *size, - Elf_Shdr *sechdr, unsigned int section) -{ - long ret; - - *size += arch_mod_section_prepend(mod, section); - ret = ALIGN(*size, sechdr->sh_addralign ?: 1); - *size = ret + sechdr->sh_size; - return ret; -} - -/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld - might -- code, read-only data, read-write data, small data. Tally - sizes, and place the offsets into sh_entsize fields: high bit means it - belongs in init. */ -static void layout_sections(struct module *mod, struct load_info *info) -{ - static unsigned long const masks[][2] = { - /* NOTE: all executable code must be the first section - * in this array; otherwise modify the text_size - * finder in the two loops below */ - { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, - { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, - { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, - { ARCH_SHF_SMALL | SHF_ALLOC, 0 } - }; - unsigned int m, i; - - for (i = 0; i < info->hdr->e_shnum; i++) - info->sechdrs[i].sh_entsize = ~0UL; - - pr_debug("Core section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < info->hdr->e_shnum; ++i) { - Elf_Shdr *s = &info->sechdrs[i]; - const char *sname = info->secstrings + s->sh_name; - - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) - continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - pr_debug("\t%s\n", sname); - } - switch (m) { - case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; - break; - case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; - break; - case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); - break; - } - } - - pr_debug("Init section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 
0; i < info->hdr->e_shnum; ++i) { - Elf_Shdr *s = &info->sechdrs[i]; - const char *sname = info->secstrings + s->sh_name; - - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) - continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) - | INIT_OFFSET_MASK); - pr_debug("\t%s\n", sname); - } - switch (m) { - case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; - break; - case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; - break; - case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); - break; - } - } -} - -static void set_license(struct module *mod, const char *license) -{ - if (!license) - license = "unspecified"; - - if (!license_is_gpl_compatible(license)) { - if (!test_taint(TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints " - "kernel.\n", mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - } -} - -/* Parse tag=value strings from .modinfo section */ -static char *next_string(char *string, unsigned long *secsize) -{ - /* Skip non-zero chars */ - while (string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - - /* Skip any zero padding. 
*/ - while (!string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - return string; -} - -static char *get_modinfo(struct load_info *info, const char *tag) -{ - char *p; - unsigned int taglen = strlen(tag); - Elf_Shdr *infosec = &info->sechdrs[info->index.info]; - unsigned long size = infosec->sh_size; - - for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { - if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') - return p + taglen + 1; - } - return NULL; -} - -static void setup_modinfo(struct module *mod, struct load_info *info) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (attr->setup) - attr->setup(mod, get_modinfo(info, attr->attr.name)); - } -} - -static void free_modinfo(struct module *mod) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (attr->free) - attr->free(mod); - } -} - -#ifdef CONFIG_KALLSYMS - -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - return bsearch(name, start, stop - start, - sizeof(struct kernel_symbol), cmp_name); -} - -static int is_exported(const char *name, unsigned long value, - const struct module *mod) -{ - const struct kernel_symbol *ks; - if (!mod) - ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); - else - ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - return ks != NULL && ks->value == value; -} - -/* As per nm */ -static char elf_type(const Elf_Sym *sym, const struct load_info *info) -{ - const Elf_Shdr *sechdrs = info->sechdrs; - - if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { - if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) - return 'v'; - else - return 'w'; - } - if (sym->st_shndx == SHN_UNDEF) - return 'U'; - if (sym->st_shndx == SHN_ABS) - return 'a'; - if (sym->st_shndx >= SHN_LORESERVE) - return '?'; - if 
(sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) - return 't'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC - && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { - if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) - return 'r'; - else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 'g'; - else - return 'd'; - } - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 's'; - else - return 'b'; - } - if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug")) { - return 'n'; - } - return '?'; -} - -static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) -{ - const Elf_Shdr *sec; - - if (src->st_shndx == SHN_UNDEF - || src->st_shndx >= shnum - || !src->st_name) - return false; - - sec = sechdrs + src->st_shndx; - if (!(sec->sh_flags & SHF_ALLOC) -#ifndef CONFIG_KALLSYMS_ALL - || !(sec->sh_flags & SHF_EXECINSTR) -#endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) - return false; - - return true; -} - -/* - * We only allocate and copy the strings needed by the parts of symtab - * we keep. This is simple, but has the effect of making multiple - * copies of duplicates. We could be more sophisticated, see - * linux-kernel thread starting with - * <73defb5e4bca04a6431392cc341112b1@localhost>. - */ -static void layout_symtab(struct module *mod, struct load_info *info) -{ - Elf_Shdr *symsect = info->sechdrs + info->index.sym; - Elf_Shdr *strsect = info->sechdrs + info->index.str; - const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size; - - /* Put symbol section at end of init part of module. 
*/ - symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, - info->index.sym) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + symsect->sh_name); - - src = (void *)info->hdr + symsect->sh_offset; - nsrc = symsect->sh_size / sizeof(*src); - - /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - strtab_size += strlen(&info->strtab[src->st_name]) + 1; - ndst++; - } - - /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - - /* Put string table section at end of init part of module. */ - strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, - info->index.str) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + strsect->sh_name); -} - -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ - unsigned int i, ndst; - const Elf_Sym *src; - Elf_Sym *dst; - char *s; - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); - /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - - /* Set types up while we still have access to sections. 
*/ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; - src = mod->symtab; - *dst = *src; - *s++ = 0; - for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) - continue; - - dst[ndst] = *src; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; - } - mod->core_num_syms = ndst; -} -#else -static inline void layout_symtab(struct module *mod, struct load_info *info) -{ -} - -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) -{ - if (!debug) - return; -#ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, debug->modname)) - printk(KERN_ERR "dynamic debug error adding module: %s\n", - debug->modname); -#endif -} - -static void dynamic_debug_remove(struct _ddebug *debug) -{ - if (debug) - ddebug_remove_module(debug->modname); -} - -void * __weak module_alloc(unsigned long size) -{ - return size == 0 ? NULL : vmalloc_exec(size); -} - -static void *module_alloc_update_bounds(unsigned long size) -{ - void *ret = module_alloc(size); - - if (ret) { - mutex_lock(&module_mutex); - /* Update module bounds. 
*/ - if ((unsigned long)ret < module_addr_min) - module_addr_min = (unsigned long)ret; - if ((unsigned long)ret + size > module_addr_max) - module_addr_max = (unsigned long)ret + size; - mutex_unlock(&module_mutex); - } - return ret; -} - -#ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - const char *name = info->secstrings + info->sechdrs[i].sh_name; - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!strstarts(name, ".data") && !strstarts(name, ".bss")) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); - } -} -#else -static inline void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ -} -#endif - -/* Sets info->hdr and info->len. */ -static int copy_and_check(struct load_info *info, - const void __user *umod, unsigned long len, - const char __user *uargs) -{ - int err; - Elf_Ehdr *hdr; - - if (len < sizeof(*hdr)) - return -ENOEXEC; - - /* Suck in entire file: we'll want most of it. */ - /* vmalloc barfs on "unusual" numbers. 
Check here */ - if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return -ENOMEM; - - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; - } - - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; - } - - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; - } - - info->hdr = hdr; - info->len = len; - return 0; - -free_hdr: - vfree(hdr); - return err; -} - -static void free_copy(struct load_info *info) -{ - vfree(info->hdr); -} - -static int rewrite_section_headers(struct load_info *info) -{ - unsigned int i; - - /* This should always be true, but let's be sure. */ - info->sechdrs[0].sh_addr = 0; - - for (i = 1; i < info->hdr->e_shnum; i++) { - Elf_Shdr *shdr = &info->sechdrs[i]; - if (shdr->sh_type != SHT_NOBITS - && info->len < shdr->sh_offset + shdr->sh_size) { - printk(KERN_ERR "Module len %lu truncated\n", - info->len); - return -ENOEXEC; - } - - /* Mark all sections sh_addr with their address in the - temporary image. */ - shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - -#ifndef CONFIG_MODULE_UNLOAD - /* Don't load .exit sections */ - if (strstarts(info->secstrings+shdr->sh_name, ".exit")) - shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; -#endif - } - - /* Track but don't keep modinfo and version sections. */ - info->index.vers = find_sec(info, "__versions"); - info->index.info = find_sec(info, ".modinfo"); - info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; - return 0; -} - -/* - * Set up our basic convenience variables (pointers to section headers, - * search for module section index etc), and do some basic section - * verification. 
- * - * Return the temporary module pointer (we'll replace it with the final - * one when we move the module sections around). - */ -static struct module *setup_load_info(struct load_info *info) -{ - unsigned int i; - int err; - struct module *mod; - - /* Set up the convenience variables */ - info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; - info->secstrings = (void *)info->hdr - + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - - err = rewrite_section_headers(info); - if (err) - return ERR_PTR(err); - - /* Find internal symbols and strings. */ - for (i = 1; i < info->hdr->e_shnum; i++) { - if (info->sechdrs[i].sh_type == SHT_SYMTAB) { - info->index.sym = i; - info->index.str = info->sechdrs[i].sh_link; - info->strtab = (char *)info->hdr - + info->sechdrs[info->index.str].sh_offset; - break; - } - } - - info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); - if (!info->index.mod) { - printk(KERN_WARNING "No module found in object\n"); - return ERR_PTR(-ENOEXEC); - } - /* This is temporary: point mod into copy of data. */ - mod = (void *)info->sechdrs[info->index.mod].sh_addr; - - if (info->index.sym == 0) { - printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", - mod->name); - return ERR_PTR(-ENOEXEC); - } - - info->index.pcpu = find_pcpusec(info); - - /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) - return ERR_PTR(-ENOEXEC); - - return mod; -} - -static int check_modinfo(struct module *mod, struct load_info *info) -{ - const char *modmagic = get_modinfo(info, "vermagic"); - int err; - - /* This is allowed: modprobe --force will invalidate it. 
*/ - if (!modmagic) { - err = try_to_force_load(mod, "bad vermagic"); - if (err) - return err; - } else if (!same_magic(modmagic, vermagic, info->index.vers)) { - printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); - return -ENOEXEC; - } - - if (!get_modinfo(info, "intree")) - add_taint_module(mod, TAINT_OOT_MODULE); - - if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP); - printk(KERN_WARNING "%s: module is from the staging directory," - " the quality is unknown, you have been warned.\n", - mod->name); - } - - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(info, "license")); - - return 0; -} - -static void find_module_sections(struct module *mod, struct load_info *info) -{ - mod->kp = section_objs(info, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(info, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(info, "__kcrctab"); - mod->gpl_syms = section_objs(info, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(info, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(info, "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(info, "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); -#endif -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(info, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); -#endif - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", - 
sizeof(*mod->tracepoints_ptrs), - &mod->num_tracepoints); -#endif -#ifdef HAVE_JUMP_LABEL - mod->jump_entries = section_objs(info, "__jump_table", - sizeof(*mod->jump_entries), - &mod->num_jump_entries); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(info, "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); -#endif -#ifdef CONFIG_TRACING - mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", - sizeof(*mod->trace_bprintk_fmt_start), - &mod->num_trace_bprintk_fmt); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_bprintk_fmt_start, - sizeof(*mod->trace_bprintk_fmt_start) * - mod->num_trace_bprintk_fmt, GFP_KERNEL); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(info, "__mcount_loc", - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif - - mod->extable = section_objs(info, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - - if (section_addr(info, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); - - info->debug = section_objs(info, "__verbose", - sizeof(*info->debug), &info->num_debug); -} - -static int move_module(struct module *mod, struct load_info *info) -{ - int i; - void *ptr; - - /* Do the allocs. */ - ptr = module_alloc_update_bounds(mod->core_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. Just mark it as not being a - * leak. 
- */ - kmemleak_not_leak(ptr); - if (!ptr) - return -ENOMEM; - - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; - - ptr = module_alloc_update_bounds(mod->init_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ - kmemleak_ignore(ptr); - if (!ptr && mod->init_size) { - module_free(mod, mod->module_core); - return -ENOMEM; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; - - /* Transfer each section which specifies SHF_ALLOC */ - pr_debug("final section addresses:\n"); - for (i = 0; i < info->hdr->e_shnum; i++) { - void *dest; - Elf_Shdr *shdr = &info->sechdrs[i]; - - if (!(shdr->sh_flags & SHF_ALLOC)) - continue; - - if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init - + (shdr->sh_entsize & ~INIT_OFFSET_MASK); - else - dest = mod->module_core + shdr->sh_entsize; - - if (shdr->sh_type != SHT_NOBITS) - memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); - /* Update sh_addr to point to copy in image. */ - shdr->sh_addr = (unsigned long)dest; - pr_debug("\t0x%lx %s\n", - (long)shdr->sh_addr, info->secstrings + shdr->sh_name); - } - - return 0; -} - -static int check_module_license_and_versions(struct module *mod) -{ - /* - * ndiswrapper is under GPL by itself, but loads proprietary modules. - * Don't use add_taint_module(), as it would prevent ndiswrapper from - * using GPL-only symbols it needs. 
- */ - if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); - - /* driverloader was caught wrongly pretending to be under GPL */ - if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - -#ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !mod->crcs) - || (mod->num_gpl_syms && !mod->gpl_crcs) - || (mod->num_gpl_future_syms && !mod->gpl_future_crcs) -#ifdef CONFIG_UNUSED_SYMBOLS - || (mod->num_unused_syms && !mod->unused_crcs) - || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) -#endif - ) { - return try_to_force_load(mod, - "no versions for exported symbols"); - } -#endif - return 0; -} - -static void flush_module_icache(const struct module *mod) -{ - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Flush the instruction cache, since we've played with text. - * Do it before processing of module parameters, so the module - * can provide parameter accessor functions of its own. - */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); - - set_fs(old_fs); -} - -int __weak module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - -static struct module *layout_and_allocate(struct load_info *info) -{ - /* Module within temporary copy. */ - struct module *mod; - Elf_Shdr *pcpusec; - int err; - - mod = setup_load_info(info); - if (IS_ERR(mod)) - return mod; - - err = check_modinfo(mod, info); - if (err) - return ERR_PTR(err); - - /* Allow arches to frob section contents and sizes. 
*/ - err = module_frob_arch_sections(info->hdr, info->sechdrs, - info->secstrings, mod); - if (err < 0) - goto out; - - pcpusec = &info->sechdrs[info->index.pcpu]; - if (pcpusec->sh_size) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, - pcpusec->sh_size, pcpusec->sh_addralign); - if (err) - goto out; - pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; - } - - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, info); - layout_symtab(mod, info); - - /* Allocate and move to the final place */ - err = move_module(mod, info); - if (err) - goto free_percpu; - - /* Module has been copied to its final place now: return it. */ - mod = (void *)info->sechdrs[info->index.mod].sh_addr; - kmemleak_load_module(mod, info); - return mod; - -free_percpu: - percpu_modfree(mod); -out: - return ERR_PTR(err); -} - -/* mod is no longer valid after this! */ -static void module_deallocate(struct module *mod, struct load_info *info) -{ - percpu_modfree(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); -} - -int __weak module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - -static int post_relocation(struct module *mod, const struct load_info *info) -{ - /* Sort exception table now relocations are done. */ - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Copy relocated percpu area over. */ - percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, - info->sechdrs[info->index.pcpu].sh_size); - - /* Setup kallsyms-specific fields. */ - add_kallsyms(mod, info); - - /* Arch-specific module finalizing. */ - return module_finalize(info->hdr, info->sechdrs, mod); -} - -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. 
*/ -static struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) -{ - struct load_info info = { NULL, }; - struct module *mod; - long err; - - pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); - - /* Copy in the blobs from userspace, check they are vaguely sane. */ - err = copy_and_check(&info, umod, len, uargs); - if (err) - return ERR_PTR(err); - - /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(&info); - if (IS_ERR(mod)) { - err = PTR_ERR(mod); - goto free_copy; - } - - /* Now module is in final location, initialize linked lists, etc. */ - err = module_unload_init(mod); - if (err) - goto free_module; - - /* Now we've got everything in the final locations, we can - * find optional sections. */ - find_module_sections(mod, &info); - - err = check_module_license_and_versions(mod); - if (err) - goto free_unload; - - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, &info); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, &info); - if (err < 0) - goto free_modinfo; - - err = apply_relocations(mod, &info); - if (err < 0) - goto free_modinfo; - - err = post_relocation(mod, &info); - if (err < 0) - goto free_modinfo; - - flush_module_icache(mod); - - /* Now copy in args */ - mod->args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(mod->args)) { - err = PTR_ERR(mod->args); - goto free_arch_cleanup; - } - - /* Mark state as coming so strong_try_module_get() ignores us. */ - mod->state = MODULE_STATE_COMING; - - /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. No one should access us, since - * strong_try_module_get() will fail. - * lockdep/oops can run asynchronous, so use the RCU list insertion - * function to insert in a way safe to concurrent readers. - * The mutex protects against concurrent writers. 
- */ - mutex_lock(&module_mutex); - if (find_module(mod->name)) { - err = -EEXIST; - goto unlock; - } - - /* This has to be done once we're sure module name is unique. */ - dynamic_debug_setup(info.debug, info.num_debug); - - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto ddebug; - - module_bug_finalize(info.hdr, info.sechdrs, mod); - list_add_rcu(&mod->list, &modules); - mutex_unlock(&module_mutex); - - /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); - if (err < 0) - goto unlink; - - /* Link in to syfs. */ - err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); - if (err < 0) - goto unlink; - - /* Get rid of temporary copy. */ - free_copy(&info); - - /* Done! */ - trace_module_load(mod); - return mod; - - unlink: - mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. */ - list_del_rcu(&mod->list); - module_bug_cleanup(mod); - - ddebug: - dynamic_debug_remove(info.debug); - unlock: - mutex_unlock(&module_mutex); - synchronize_sched(); - kfree(mod->args); - free_arch_cleanup: - module_arch_cleanup(mod); - free_modinfo: - free_modinfo(mod); - free_unload: - module_unload_free(mod); - free_module: - module_deallocate(mod, &info); - free_copy: - free_copy(&info); - return ERR_PTR(err); -} - -/* Call module constructors. 
*/ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif -} - -/* This is where the real work happens */ -SYSCALL_DEFINE3(init_module, void __user *, umod, - unsigned long, len, const char __user *, uargs) -{ - struct module *mod; - int ret = 0; - - /* Must have permission */ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; - - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) - return PTR_ERR(mod); - - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); - - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } - - /* Now it's a first class citizen! Wake up anyone waiting for it. */ - mod->state = MODULE_STATE_LIVE; - wake_up(&module_wq); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); - - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); - - mutex_lock(&module_mutex); - /* Drop initial reference. 
*/ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; -#endif - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); - - return 0; -} - -static inline int within(unsigned long addr, void *start, unsigned long size) -{ - return ((void *)addr >= start && (void *)addr < start + size); -} - -#ifdef CONFIG_KALLSYMS -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - return str[0] == '$' && strchr("atd", str[1]) - && (str[2] == '\0' || str[2] == '.'); -} - -static const char *get_ksymbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) -{ - unsigned int i, best = 0; - unsigned long nextval; - - /* At worse, next value is at end of module */ - if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; - else - nextval = (unsigned long)mod->module_core+mod->core_text_size; - - /* Scan for closest preceding symbol, and next symbol. (ELF - starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) - continue; - - /* We ignore unnamed symbols: they're uninformative - * and inserted at a whim. 
*/ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; - } - - if (!best) - return NULL; - - if (size) - *size = nextval - mod->symtab[best].st_value; - if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; -} - -/* For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - char *namebuf) -{ - struct module *mod; - const char *ret = NULL; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { - if (modname) - *modname = mod->name; - ret = get_ksymbol(mod, addr, size, offset); - break; - } - } - /* Make a copy in here where it's safe */ - if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); - ret = namebuf; - } - preempt_enable(); - return ret; -} - -int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { - const char *sym; - - sym = get_ksymbol(mod, addr, NULL, NULL); - if (!sym) - goto out; - strlcpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char 
*name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { - const char *sym; - - sym = get_ksymbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strlcpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); - strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, *value, mod); - preempt_enable(); - return 0; - } - symnum -= mod->num_symtab; - } - preempt_enable(); - return -ERANGE; -} - -static unsigned long mod_find_symname(struct module *mod, const char *name) -{ - unsigned int i; - - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; - return 0; -} - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) -{ - struct module *mod; - char *colon; - unsigned long ret = 0; - - /* Don't lock: we're in enough trouble already. 
*/ - preempt_disable(); - if ((colon = strchr(name, ':')) != NULL) { - *colon = '\0'; - if ((mod = find_module(name)) != NULL) - ret = mod_find_symname(mod, colon+1); - *colon = ':'; - } else { - list_for_each_entry_rcu(mod, &modules, list) - if ((ret = mod_find_symname(mod, name)) != 0) - break; - } - preempt_enable(); - return ret; -} - -int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, - struct module *, unsigned long), - void *data) -{ - struct module *mod; - unsigned int i; - int ret; - - list_for_each_entry(mod, &modules, list) { - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); - if (ret != 0) - return ret; - } - } - return 0; -} -#endif /* CONFIG_KALLSYMS */ - -static char *module_flags(struct module *mod, char *buf) -{ - int bx = 0; - - if (mod->taints || - mod->state == MODULE_STATE_GOING || - mod->state == MODULE_STATE_COMING) { - buf[bx++] = '('; - bx += module_flags_taint(mod, buf + bx); - /* Show a - for module-is-being-unloaded */ - if (mod->state == MODULE_STATE_GOING) - buf[bx++] = '-'; - /* Show a + for module-is-being-loaded */ - if (mod->state == MODULE_STATE_COMING) - buf[bx++] = '+'; - buf[bx++] = ')'; - } - buf[bx] = '\0'; - - return buf; -} - -#ifdef CONFIG_PROC_FS -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - -static int m_show(struct seq_file *m, void *p) -{ - struct module *mod = list_entry(p, struct module, list); - char buf[8]; - - seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); - print_unload_info(m, mod); - - /* Informative for users. 
*/ - seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading": - mod->state == MODULE_STATE_COMING ? "Loading": - "Live"); - /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); - - /* Taints info */ - if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); - - seq_printf(m, "\n"); - return 0; -} - -/* Format: modulename size refcount deps address - - Where refcount is a number or -, and deps is a comma-separated list - of depends or -. -*/ -static const struct seq_operations modules_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = m_show -}; - -static int modules_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &modules_op); -} - -static const struct file_operations proc_modules_operations = { - .open = modules_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init proc_modules_init(void) -{ - proc_create("modules", 0, NULL, &proc_modules_operations); - return 0; -} -module_init(proc_modules_init); -#endif - -/* Given an address, look for it in the module exception tables. */ -const struct exception_table_entry *search_module_extables(unsigned long addr) -{ - const struct exception_table_entry *e = NULL; - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->num_exentries == 0) - continue; - - e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, - addr); - if (e) - break; - } - preempt_enable(); - - /* Now, if we found one, we are running inside it now, hence - we cannot unload the module, hence no refcnt needed. */ - return e; -} - -/* - * is_module_address - is this address inside a module? - * @addr: the address to check. - * - * See is_module_text_address() if you simply want to see if the address - * is code (not data). 
- */ -bool is_module_address(unsigned long addr) -{ - bool ret; - - preempt_disable(); - ret = __module_address(addr) != NULL; - preempt_enable(); - - return ret; -} - -/* - * __module_address - get the module which contains an address. - * @addr: the address. - * - * Must be called with preempt disabled or module mutex held so that - * module doesn't get freed during this. - */ -struct module *__module_address(unsigned long addr) -{ - struct module *mod; - - if (addr < module_addr_min || addr > module_addr_max) - return NULL; - - list_for_each_entry_rcu(mod, &modules, list) - if (within_module_core(addr, mod) - || within_module_init(addr, mod)) - return mod; - return NULL; -} -EXPORT_SYMBOL_GPL(__module_address); - -/* - * is_module_text_address - is this address inside module code? - * @addr: the address to check. - * - * See is_module_address() if you simply want to see if the address is - * anywhere in a module. See kernel_text_address() for testing if an - * address corresponds to kernel or module code. - */ -bool is_module_text_address(unsigned long addr) -{ - bool ret; - - preempt_disable(); - ret = __module_text_address(addr) != NULL; - preempt_enable(); - - return ret; -} - -/* - * __module_text_address - get the module whose code contains an address. - * @addr: the address. - * - * Must be called with preempt disabled or module mutex held so that - * module doesn't get freed during this. - */ -struct module *__module_text_address(unsigned long addr) -{ - struct module *mod = __module_address(addr); - if (mod) { - /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) - mod = NULL; - } - return mod; -} -EXPORT_SYMBOL_GPL(__module_text_address); - -/* Don't grab lock, we're oopsing. 
*/ -void print_modules(void) -{ - struct module *mod; - char buf[8]; - - printk(KERN_DEFAULT "Modules linked in:"); - /* Most callers should already have preempt disabled, but make sure */ - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) - printk(" %s%s", mod->name, module_flags(mod, buf)); - preempt_enable(); - if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); -} - -#ifdef CONFIG_MODVERSIONS -/* Generate the signature for all relevant module structures here. - * If these change, we don't want to try to parse the module. */ -void module_layout(struct module *mod, - struct modversion_info *ver, - struct kernel_param *kp, - struct kernel_symbol *ks, - struct tracepoint * const *tp) -{ -} -EXPORT_SYMBOL(module_layout); -#endif -/* - * kernel/mutex-debug.c - * - * Debugging code for mutexes - * - * Started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * lock debugging, locking tree, deadlock detection started by: - * - * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Released under the General Public License (GPL). - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mutex-debug.h" - -/* - * Must be called with lock->wait_lock held. 
- */ -void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ - memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); - waiter->magic = waiter; - INIT_LIST_HEAD(&waiter->list); -} - -void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) -{ - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); - DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); -} - -void debug_mutex_free_waiter(struct mutex_waiter *waiter) -{ - DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); - memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); -} - -void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) -{ - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); - - /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; -} - -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) -{ - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; - - list_del_init(&waiter->list); - waiter->task = NULL; -} - -void debug_mutex_unlock(struct mutex *lock) -{ - if (unlikely(!debug_locks)) - return; - - DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); -} - -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->magic = lock; -} - -/*** - * mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - 
* This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void mutex_destroy(struct mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); - lock->magic = NULL; -} - -EXPORT_SYMBOL_GPL(mutex_destroy); -/* - * kernel/mutex.c - * - * Mutexes: blocking mutual exclusion locks - * - * Started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and - * David Howells for suggestions and improvements. - * - * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline - * from the -rt tree, where it was originally implemented for rtmutexes - * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale - * and Sven Dietrich. - * - * Also see Documentation/mutex-design.txt. - */ -#include -#include -#include -#include -#include -#include - -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ -#ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" -# include -#else -# include "mutex.h" -# include -#endif - -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) -{ - atomic_set(&lock->count, 1); - spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); - - debug_mutex_init(lock, name, key); -} - -EXPORT_SYMBOL(__mutex_init); - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -/* - * We split the mutex lock/unlock logic into separate fastpath and - * slowpath functions, to reduce the register pressure on the fastpath. - * We also put the fastpath first in the kernel image, to make sure the - * branch is predicted by the CPU as default-untaken. 
- */ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); - -/** - * mutex_lock - acquire the mutex - * @lock: the mutex to be acquired - * - * Lock the mutex exclusively for this task. If the mutex is not - * available right now, it will sleep until it can get it. - * - * The mutex must later on be released by the same task that - * acquired it. Recursive locking is not allowed. The task - * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with - * the mutex still locked. The mutex must first be initialized - * (or statically defined) before it can be locked. memset()-ing - * the mutex to 0 is not allowed. - * - * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging - * checks that will enforce the restrictions and will also do - * deadlock debugging. ) - * - * This function is similar to (but not equivalent to) down(). - */ -void __sched mutex_lock(struct mutex *lock) -{ - might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} - -EXPORT_SYMBOL(mutex_lock); -#endif - -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); - -/** - * mutex_unlock - release the mutex - * @lock: the mutex to be released - * - * Unlock a mutex that has been locked by this task previously. - * - * This function must not be used in interrupt context. Unlocking - * of a not locked mutex is not allowed. - * - * This function is similar to (but not equivalent to) up(). 
- */ -void __sched mutex_unlock(struct mutex *lock) -{ - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); -#endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); -} - -EXPORT_SYMBOL(mutex_unlock); - -/* - * Lock a mutex (possibly interruptible), slowpath: - */ -static inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip) -{ - struct task_struct *task = current; - struct mutex_waiter waiter; - unsigned long flags; - - preempt_disable(); - mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - /* - * Optimistic spinning. - * - * We try to spin for acquisition when we find that there are no - * pending waiters and the lock owner is currently running on a - * (different) CPU. - * - * The rationale is that if the lock owner is running, it is likely to - * release the lock soon. - * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - */ - - for (;;) { - struct task_struct *owner; - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. 
- */ - owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; - - if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { - lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); - preempt_enable(); - return 0; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - arch_mutex_cpu_relax(); - } -#endif - spin_lock_mutex(&lock->wait_lock, flags); - - debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); - - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; - - if (atomic_xchg(&lock->count, -1) == 1) - goto done; - - lock_contended(&lock->dep_map, ip); - - for (;;) { - /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters: - */ - if (atomic_xchg(&lock->count, -1) == 1) - break; - - /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) 
- */ - if (unlikely(signal_pending_state(state, task))) { - mutex_remove_waiter(lock, &waiter, - task_thread_info(task)); - mutex_release(&lock->dep_map, 1, ip); - spin_unlock_mutex(&lock->wait_lock, flags); - - debug_mutex_free_waiter(&waiter); - preempt_enable(); - return -EINTR; - } - __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); - spin_lock_mutex(&lock->wait_lock, flags); - } - -done: - lock_acquired(&lock->dep_map, ip); - /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, current_thread_info()); - mutex_set_owner(lock); - - /* set it to 0 if there are no waiters left: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - debug_mutex_free_waiter(&waiter); - preempt_enable(); - - return 0; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __sched -mutex_lock_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); -} - -EXPORT_SYMBOL_GPL(mutex_lock_nested); - -void __sched -_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) -{ - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); -} - -EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); - -int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); -} -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); - -int __sched -mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_); -} - -EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -#endif - -/* - * Release the lock, slowpath: - */ -static inline void 
-__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - - /* - * some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - */ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); - - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); - - debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); - } - - spin_unlock_mutex(&lock->wait_lock, flags); -} - -/* - * Release the lock, slowpath: - */ -static __used noinline void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - __mutex_unlock_common_slowpath(lock_count, 1); -} - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -/* - * Here come the less common (and hence less performance-critical) APIs: - * mutex_lock_interruptible() and mutex_trylock(). - */ -static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count); - -static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count); - -/** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired - * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. - * - * This function is similar to (but not equivalent to) down_interruptible(). 
- */ -int __sched mutex_lock_interruptible(struct mutex *lock) -{ - int ret; - - might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); - if (!ret) - mutex_set_owner(lock); - - return ret; -} - -EXPORT_SYMBOL(mutex_lock_interruptible); - -int __sched mutex_lock_killable(struct mutex *lock) -{ - int ret; - - might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_killable_slowpath); - if (!ret) - mutex_set_owner(lock); - - return ret; -} -EXPORT_SYMBOL(mutex_lock_killable); - -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); -} - -static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); -} - -static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); -} -#endif - -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - -/** - * mutex_trylock - try to acquire the mutex, 
without waiting - * @lock: the mutex to be acquired - * - * Try to acquire the mutex atomically. Returns 1 if the mutex - * has been acquired successfully, and 0 on contention. - * - * NOTE: this function follows the spin_trylock() convention, so - * it is negated from the down_trylock() return values! Be careful - * about this when converting semaphore users to mutexes. - * - * This function must not be used in interrupt context. The - * mutex must be released by the same task that acquired it. - */ -int __sched mutex_trylock(struct mutex *lock) -{ - int ret; - - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); - - return ret; -} -EXPORT_SYMBOL(mutex_trylock); - -/** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec - * @lock: the mutex to return holding if we dec to 0 - * - * return true and hold lock if we dec to 0, return false otherwise - */ -int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) -{ - /* dec if we can't possibly hit 0 */ - if (atomic_add_unless(cnt, -1, 1)) - return 0; - /* we might hit 0, so take the lock */ - mutex_lock(lock); - if (!atomic_dec_and_test(cnt)) { - /* when we actually did the dec, we didn't hit 0 */ - mutex_unlock(lock); - return 0; - } - /* we hit 0, and we hold the lock */ - return 1; -} -EXPORT_SYMBOL(atomic_dec_and_mutex_lock); -#include -#include -#include -#include -#include -#include -#include - -/* - * Notifier list for kernel code which wants to be called - * at shutdown. This is used to stop any idling DMA operations - * and the like. - */ -BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); - -/* - * Notifier chain core routines. The exported routines below - * are layered on top of these, with appropriate locking added. 
- */ - -static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - -static int notifier_chain_cond_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) - return 0; - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - -static int notifier_chain_unregister(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) { - rcu_assign_pointer(*nl, n->next); - return 0; - } - nl = &((*nl)->next); - } - return -ENOENT; -} - -/** - * notifier_call_chain - Informs the registered notifiers about an event. - * @nl: Pointer to head of the blocking notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: Number of notifier functions to be called. Don't care - * value of this parameter is -1. - * @nr_calls: Records the number of notifications sent. Don't care - * value of this field is NULL. - * @returns: notifier_call_chain returns the value returned by the - * last notifier function called. 
- */ -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret = NOTIFY_DONE; - struct notifier_block *nb, *next_nb; - - nb = rcu_dereference_raw(*nl); - - while (nb && nr_to_call) { - next_nb = rcu_dereference_raw(nb->next); - -#ifdef CONFIG_DEBUG_NOTIFIERS - if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { - WARN(1, "Invalid notifier called!"); - nb = next_nb; - continue; - } -#endif - ret = nb->notifier_call(nb, val, v); - - if (nr_calls) - (*nr_calls)++; - - if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) - break; - nb = next_nb; - nr_to_call--; - } - return ret; -} - -/* - * Atomic notifier chain routines. Registration and unregistration - * use a spinlock, and call_chain is synchronized by RCU (no locks). - */ - -/** - * atomic_notifier_chain_register - Add notifier to an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to an atomic notifier chain. - * - * Currently always returns zero. - */ -int atomic_notifier_chain_register(struct atomic_notifier_head *nh, - struct notifier_block *n) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_register(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); - -/** - * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from an atomic notifier chain. - * - * Returns zero on success or %-ENOENT on failure. 
- */ -int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, - struct notifier_block *n) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_unregister(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); - synchronize_rcu(); - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); - -/** - * __atomic_notifier_call_chain - Call functions in an atomic notifier chain - * @nh: Pointer to head of the atomic notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See the comment for notifier_call_chain. - * @nr_calls: See the comment for notifier_call_chain. - * - * Calls each function in a notifier chain in turn. The functions - * run in an atomic context, so they must not block. - * This routine uses RCU to synchronize with changes to the chain. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ -int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret; - - rcu_read_lock(); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); - -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) -{ - return __atomic_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); - -/* - * Blocking notifier chain routines. All access to the chain is - * synchronized by an rwsem. 
- */ - -/** - * blocking_notifier_chain_register - Add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain. - * Must be called in process context. - * - * Currently always returns zero. - */ -int blocking_notifier_chain_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call down_write(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); - - down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); - -/** - * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain, only if not already - * present in the chain. - * Must be called in process context. - * - * Currently always returns zero. - */ -int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - down_write(&nh->rwsem); - ret = notifier_chain_cond_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); - -/** - * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from a blocking notifier chain. - * Must be called from process context. - * - * Returns zero on success or %-ENOENT on failure. 
- */ -int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call down_write(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_unregister(&nh->head, n); - - down_write(&nh->rwsem); - ret = notifier_chain_unregister(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); - -/** - * __blocking_notifier_call_chain - Call functions in a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain. - * - * Calls each function in a notifier chain in turn. The functions - * run in a process context, so they are allowed to block. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. 
- */ -int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret = NOTIFY_DONE; - - /* - * We check the head outside the lock, but if this access is - * racy then it does not matter what the result of the test - * is, we re-check the list after having taken the lock anyway: - */ - if (rcu_dereference_raw(nh->head)) { - down_read(&nh->rwsem); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, - nr_calls); - up_read(&nh->rwsem); - } - return ret; -} -EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); - -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, - unsigned long val, void *v) -{ - return __blocking_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); - -/* - * Raw notifier chain routines. There is no protection; - * the caller must provide it. Use at your own risk! - */ - -/** - * raw_notifier_chain_register - Add notifier to a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a raw notifier chain. - * All locking must be provided by the caller. - * - * Currently always returns zero. - */ -int raw_notifier_chain_register(struct raw_notifier_head *nh, - struct notifier_block *n) -{ - return notifier_chain_register(&nh->head, n); -} -EXPORT_SYMBOL_GPL(raw_notifier_chain_register); - -/** - * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from a raw notifier chain. - * All locking must be provided by the caller. - * - * Returns zero on success or %-ENOENT on failure. 
- */ -int raw_notifier_chain_unregister(struct raw_notifier_head *nh, - struct notifier_block *n) -{ - return notifier_chain_unregister(&nh->head, n); -} -EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); - -/** - * __raw_notifier_call_chain - Call functions in a raw notifier chain - * @nh: Pointer to head of the raw notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain - * - * Calls each function in a notifier chain in turn. The functions - * run in an undefined context. - * All locking must be provided by the caller. - * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ -int __raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -} -EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); - -int raw_notifier_call_chain(struct raw_notifier_head *nh, - unsigned long val, void *v) -{ - return __raw_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(raw_notifier_call_chain); - -/* - * SRCU notifier chain routines. Registration and unregistration - * use a mutex, and call_chain is synchronized by SRCU (no locks). - */ - -/** - * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to an SRCU notifier chain. - * Must be called in process context. - * - * Currently always returns zero. 
- */ -int srcu_notifier_chain_register(struct srcu_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call mutex_lock(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); - - mutex_lock(&nh->mutex); - ret = notifier_chain_register(&nh->head, n); - mutex_unlock(&nh->mutex); - return ret; -} -EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); - -/** - * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @n: Entry to remove from notifier chain - * - * Removes a notifier from an SRCU notifier chain. - * Must be called from process context. - * - * Returns zero on success or %-ENOENT on failure. - */ -int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - /* - * This code gets used during boot-up, when task switching is - * not yet working and interrupts must remain disabled. At - * such times we must not call mutex_lock(). - */ - if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_unregister(&nh->head, n); - - mutex_lock(&nh->mutex); - ret = notifier_chain_unregister(&nh->head, n); - mutex_unlock(&nh->mutex); - synchronize_srcu(&nh->srcu); - return ret; -} -EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); - -/** - * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain - * @nh: Pointer to head of the SRCU notifier chain - * @val: Value passed unmodified to notifier function - * @v: Pointer passed unmodified to notifier function - * @nr_to_call: See comment for notifier_call_chain. - * @nr_calls: See comment for notifier_call_chain - * - * Calls each function in a notifier chain in turn. The functions - * run in a process context, so they are allowed to block. 
- * - * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() - * will return immediately, with the return value of - * the notifier function which halted execution. - * Otherwise the return value is the return value - * of the last notifier function called. - */ -int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) -{ - int ret; - int idx; - - idx = srcu_read_lock(&nh->srcu); - ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - srcu_read_unlock(&nh->srcu, idx); - return ret; -} -EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); - -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, - unsigned long val, void *v) -{ - return __srcu_notifier_call_chain(nh, val, v, -1, NULL); -} -EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); - -/** - * srcu_init_notifier_head - Initialize an SRCU notifier head - * @nh: Pointer to head of the srcu notifier chain - * - * Unlike other sorts of notifier heads, SRCU notifier heads require - * dynamic initialization. Be sure to call this routine before - * calling any of the other SRCU notifier routines for this head. - * - * If an SRCU notifier head is deallocated, it must first be cleaned - * up by calling srcu_cleanup_notifier_head(). Otherwise the head's - * per-cpu data (used by the SRCU mechanism) will leak. 
- */ -void srcu_init_notifier_head(struct srcu_notifier_head *nh) -{ - mutex_init(&nh->mutex); - if (init_srcu_struct(&nh->srcu) < 0) - BUG(); - nh->head = NULL; -} -EXPORT_SYMBOL_GPL(srcu_init_notifier_head); - -static ATOMIC_NOTIFIER_HEAD(die_chain); - -int notrace __kprobes notify_die(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - struct die_args args = { - .regs = regs, - .str = str, - .err = err, - .trapnr = trap, - .signr = sig, - - }; - return atomic_notifier_call_chain(&die_chain, val, &args); -} - -int register_die_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(&die_chain, nb); -} -EXPORT_SYMBOL_GPL(register_die_notifier); - -int unregister_die_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&die_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_die_notifier); -/* - * Copyright (C) 2006 IBM Corporation - * - * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * - * Jun 2006 - namespaces support - * OpenVZ, SWsoft Inc. 
- * Pavel Emelianov - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *nsproxy_cachep; - -struct nsproxy init_nsproxy = { - .count = ATOMIC_INIT(1), - .uts_ns = &init_uts_ns, -#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) - .ipc_ns = &init_ipc_ns, -#endif - .mnt_ns = NULL, - .pid_ns = &init_pid_ns, -#ifdef CONFIG_NET - .net_ns = &init_net, -#endif -}; - -static inline struct nsproxy *create_nsproxy(void) -{ - struct nsproxy *nsproxy; - - nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); - if (nsproxy) - atomic_set(&nsproxy->count, 1); - return nsproxy; -} - -/* - * Create new nsproxy and all of its the associated namespaces. - * Return the newly created nsproxy. Do not attach this to the task, - * leave it to the caller to do proper locking and attach it to task. - */ -static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) -{ - struct nsproxy *new_nsp; - int err; - - new_nsp = create_nsproxy(); - if (!new_nsp) - return ERR_PTR(-ENOMEM); - - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); - if (IS_ERR(new_nsp->mnt_ns)) { - err = PTR_ERR(new_nsp->mnt_ns); - goto out_ns; - } - - new_nsp->uts_ns = copy_utsname(flags, tsk); - if (IS_ERR(new_nsp->uts_ns)) { - err = PTR_ERR(new_nsp->uts_ns); - goto out_uts; - } - - new_nsp->ipc_ns = copy_ipcs(flags, tsk); - if (IS_ERR(new_nsp->ipc_ns)) { - err = PTR_ERR(new_nsp->ipc_ns); - goto out_ipc; - } - - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); - if (IS_ERR(new_nsp->pid_ns)) { - err = PTR_ERR(new_nsp->pid_ns); - goto out_pid; - } - - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); - if (IS_ERR(new_nsp->net_ns)) { - err = PTR_ERR(new_nsp->net_ns); - goto out_net; - } - - return new_nsp; - -out_net: - if (new_nsp->pid_ns) - put_pid_ns(new_nsp->pid_ns); -out_pid: - if (new_nsp->ipc_ns) - 
put_ipc_ns(new_nsp->ipc_ns); -out_ipc: - if (new_nsp->uts_ns) - put_uts_ns(new_nsp->uts_ns); -out_uts: - if (new_nsp->mnt_ns) - put_mnt_ns(new_nsp->mnt_ns); -out_ns: - kmem_cache_free(nsproxy_cachep, new_nsp); - return ERR_PTR(err); -} - -/* - * called from clone. This now handles copy for nsproxy and all - * namespaces therein. - */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) -{ - struct nsproxy *old_ns = tsk->nsproxy; - struct nsproxy *new_ns; - int err = 0; - - if (!old_ns) - return 0; - - get_nsproxy(old_ns); - - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET))) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } - - /* - * CLONE_NEWIPC must detach from the undolist: after switching - * to a new ipc namespace, the semaphore arrays from the old - * namespace are unreachable. In clone parlance, CLONE_SYSVSEM - * means share undolist with parent, so we must forbid using - * it along with CLONE_NEWIPC. - */ - if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { - err = -EINVAL; - goto out; - } - - new_ns = create_new_namespaces(flags, tsk, tsk->fs); - if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto out; - } - - tsk->nsproxy = new_ns; - -out: - put_nsproxy(old_ns); - return err; -} - -void free_nsproxy(struct nsproxy *ns) -{ - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - put_pid_ns(ns->pid_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - -/* - * Called from unshare. Unshare all the namespaces part of nsproxy. - * On success, returns the new nsproxy. 
- */ -int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) -{ - int err = 0; - - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) - return 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); - if (IS_ERR(*new_nsp)) { - err = PTR_ERR(*new_nsp); - goto out; - } - -out: - return err; -} - -void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) -{ - struct nsproxy *ns; - - might_sleep(); - - ns = p->nsproxy; - - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. - * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); - free_nsproxy(ns); - } -} - -void exit_task_namespaces(struct task_struct *p) -{ - switch_task_namespaces(p, NULL); -} - -SYSCALL_DEFINE2(setns, int, fd, int, nstype) -{ - const struct proc_ns_operations *ops; - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; - struct proc_inode *ei; - struct file *file; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); - - err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) - goto out; - - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); - goto out; - } - - err = ops->install(new_nsproxy, ei->ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; - } - switch_task_namespaces(tsk, new_nsproxy); -out: - fput(file); - return err; -} - -int __init nsproxy_cache_init(void) -{ - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); - return 0; -} -/* - * padata.c - generic interface to process data streams in parallel - * - * 
Copyright (C) 2008, 2009 secunet Security Networks AG - * Copyright (C) 2008, 2009 Steffen Klassert - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_SEQ_NR (INT_MAX - NR_CPUS) -#define MAX_OBJ_NUM 1000 - -static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) -{ - int cpu, target_cpu; - - target_cpu = cpumask_first(pd->cpumask.pcpu); - for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); - - return target_cpu; -} - -static int padata_cpu_hash(struct padata_priv *padata) -{ - int cpu_index; - struct parallel_data *pd; - - pd = padata->pd; - - /* - * Hash the sequence numbers to the cpus by taking - * seq_nr mod. number of cpus in use. 
- */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); - - return padata_index_to_cpu(pd, cpu_index); -} - -static void padata_parallel_worker(struct work_struct *parallel_work) -{ - struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; - LIST_HEAD(local_list); - - local_bh_disable(); - pqueue = container_of(parallel_work, - struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; - - spin_lock(&pqueue->parallel.lock); - list_replace_init(&pqueue->parallel.list, &local_list); - spin_unlock(&pqueue->parallel.lock); - - while (!list_empty(&local_list)) { - struct padata_priv *padata; - - padata = list_entry(local_list.next, - struct padata_priv, list); - - list_del_init(&padata->list); - - padata->parallel(padata); - } - - local_bh_enable(); -} - -/** - * padata_do_parallel - padata parallelization function - * - * @pinst: padata instance - * @padata: object to be parallelized - * @cb_cpu: cpu the serialization callback function will run on, - * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). - * - * The parallelization callback function will run with BHs off. - * Note: Every object which is parallelized by padata_do_parallel - * must be seen by padata_do_serial. 
- */ -int padata_do_parallel(struct padata_instance *pinst, - struct padata_priv *padata, int cb_cpu) -{ - int target_cpu, err; - struct padata_parallel_queue *queue; - struct parallel_data *pd; - - rcu_read_lock_bh(); - - pd = rcu_dereference(pinst->pd); - - err = -EINVAL; - if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) - goto out; - - if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) - goto out; - - err = -EBUSY; - if ((pinst->flags & PADATA_RESET)) - goto out; - - if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) - goto out; - - err = 0; - atomic_inc(&pd->refcnt); - padata->pd = pd; - padata->cb_cpu = cb_cpu; - - if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) - atomic_set(&pd->seq_nr, -1); - - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - - target_cpu = padata_cpu_hash(padata); - queue = per_cpu_ptr(pd->pqueue, target_cpu); - - spin_lock(&queue->parallel.lock); - list_add_tail(&padata->list, &queue->parallel.list); - spin_unlock(&queue->parallel.lock); - - queue_work_on(target_cpu, pinst->wq, &queue->work); - -out: - rcu_read_unlock_bh(); - - return err; -} -EXPORT_SYMBOL(padata_do_parallel); - -/* - * padata_get_next - Get the next object that needs serialization. - * - * Return values are: - * - * A pointer to the control struct of the next object that needs - * serialization, if present in one of the percpu reorder queues. - * - * NULL, if all percpu reorder queues are empty. - * - * -EINPROGRESS, if the next object that needs serialization will - * be parallel processed by another cpu and is not yet present in - * the cpu's reorder queue. - * - * -ENODATA, if this cpu has to do the parallel processing for - * the next object. 
- */ -static struct padata_priv *padata_get_next(struct parallel_data *pd) -{ - int cpu, num_cpus; - int next_nr, next_index; - struct padata_parallel_queue *queue, *next_queue; - struct padata_priv *padata; - struct padata_list *reorder; - - num_cpus = cpumask_weight(pd->cpumask.pcpu); - - /* - * Calculate the percpu reorder queue and the sequence - * number of the next object. - */ - next_nr = pd->processed; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); - next_queue = per_cpu_ptr(pd->pqueue, cpu); - - if (unlikely(next_nr > pd->max_seq_nr)) { - next_nr = next_nr - pd->max_seq_nr - 1; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); - next_queue = per_cpu_ptr(pd->pqueue, cpu); - pd->processed = 0; - } - - padata = NULL; - - reorder = &next_queue->reorder; - - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - BUG_ON(next_nr != padata->seq_nr); - - spin_lock(&reorder->lock); - list_del_init(&padata->list); - atomic_dec(&pd->reorder_objects); - spin_unlock(&reorder->lock); - - pd->processed++; - - goto out; - } - - queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); - if (queue->cpu_index == next_queue->cpu_index) { - padata = ERR_PTR(-ENODATA); - goto out; - } - - padata = ERR_PTR(-EINPROGRESS); -out: - return padata; -} - -static void padata_reorder(struct parallel_data *pd) -{ - struct padata_priv *padata; - struct padata_serial_queue *squeue; - struct padata_instance *pinst = pd->pinst; - - /* - * We need to ensure that only one cpu can work on dequeueing of - * the reorder queue the time. Calculating in which percpu reorder - * queue the next object will arrive takes some time. A spinlock - * would be highly contended. Also it is not clear in which order - * the objects arrive to the reorder queues. So a cpu could wait to - * get the lock just to notice that there is nothing to do at the - * moment. 
Therefore we use a trylock and let the holder of the lock - * care for all the objects enqueued during the holdtime of the lock. - */ - if (!spin_trylock_bh(&pd->lock)) - return; - - while (1) { - padata = padata_get_next(pd); - - /* - * All reorder queues are empty, or the next object that needs - * serialization is parallel processed by another cpu and is - * still on it's way to the cpu's reorder queue, nothing to - * do for now. - */ - if (!padata || PTR_ERR(padata) == -EINPROGRESS) - break; - - /* - * This cpu has to do the parallel processing of the next - * object. It's waiting in the cpu's parallelization queue, - * so exit immediately. - */ - if (PTR_ERR(padata) == -ENODATA) { - del_timer(&pd->timer); - spin_unlock_bh(&pd->lock); - return; - } - - squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); - - spin_lock(&squeue->serial.lock); - list_add_tail(&padata->list, &squeue->serial.list); - spin_unlock(&squeue->serial.lock); - - queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); - } - - spin_unlock_bh(&pd->lock); - - /* - * The next object that needs serialization might have arrived to - * the reorder queues in the meantime, we will be called again - * from the timer function if no one else cares for it. 
- */ - if (atomic_read(&pd->reorder_objects) - && !(pinst->flags & PADATA_RESET)) - mod_timer(&pd->timer, jiffies + HZ); - else - del_timer(&pd->timer); - - return; -} - -static void padata_reorder_timer(unsigned long arg) -{ - struct parallel_data *pd = (struct parallel_data *)arg; - - padata_reorder(pd); -} - -static void padata_serial_worker(struct work_struct *serial_work) -{ - struct padata_serial_queue *squeue; - struct parallel_data *pd; - LIST_HEAD(local_list); - - local_bh_disable(); - squeue = container_of(serial_work, struct padata_serial_queue, work); - pd = squeue->pd; - - spin_lock(&squeue->serial.lock); - list_replace_init(&squeue->serial.list, &local_list); - spin_unlock(&squeue->serial.lock); - - while (!list_empty(&local_list)) { - struct padata_priv *padata; - - padata = list_entry(local_list.next, - struct padata_priv, list); - - list_del_init(&padata->list); - - padata->serial(padata); - atomic_dec(&pd->refcnt); - } - local_bh_enable(); -} - -/** - * padata_do_serial - padata serialization function - * - * @padata: object to be serialized. - * - * padata_do_serial must be called for every parallelized object. - * The serialization callback function will run with BHs off. 
- */ -void padata_do_serial(struct padata_priv *padata) -{ - int cpu; - struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - - pd = padata->pd; - - cpu = get_cpu(); - pqueue = per_cpu_ptr(pd->pqueue, cpu); - - spin_lock(&pqueue->reorder.lock); - atomic_inc(&pd->reorder_objects); - list_add_tail(&padata->list, &pqueue->reorder.list); - spin_unlock(&pqueue->reorder.lock); - - put_cpu(); - - padata_reorder(pd); -} -EXPORT_SYMBOL(padata_do_serial); - -static int padata_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) -{ - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - return -ENOMEM; - - cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.cbcpu); - return -ENOMEM; - } - - cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); - return 0; -} - -static void __padata_list_init(struct padata_list *pd_list) -{ - INIT_LIST_HEAD(&pd_list->list); - spin_lock_init(&pd_list->lock); -} - -/* Initialize all percpu queues used by serial workers */ -static void padata_init_squeues(struct parallel_data *pd) -{ - int cpu; - struct padata_serial_queue *squeue; - - for_each_cpu(cpu, pd->cpumask.cbcpu) { - squeue = per_cpu_ptr(pd->squeue, cpu); - squeue->pd = pd; - __padata_list_init(&squeue->serial); - INIT_WORK(&squeue->work, padata_serial_worker); - } -} - -/* Initialize all percpu queues used by parallel workers */ -static void padata_init_pqueues(struct parallel_data *pd) -{ - int cpu_index, num_cpus, cpu; - struct padata_parallel_queue *pqueue; - - cpu_index = 0; - for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - pqueue->pd = pd; - pqueue->cpu_index = cpu_index; - cpu_index++; - - __padata_list_init(&pqueue->reorder); - __padata_list_init(&pqueue->parallel); - INIT_WORK(&pqueue->work, padata_parallel_worker); - atomic_set(&pqueue->num_obj, 0); - } - - num_cpus = 
cpumask_weight(pd->cpumask.pcpu); - pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; -} - -/* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) -{ - struct parallel_data *pd; - - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); - if (!pd) - goto err; - - pd->pqueue = alloc_percpu(struct padata_parallel_queue); - if (!pd->pqueue) - goto err_free_pd; - - pd->squeue = alloc_percpu(struct padata_serial_queue); - if (!pd->squeue) - goto err_free_pqueue; - if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) - goto err_free_squeue; - - padata_init_pqueues(pd); - padata_init_squeues(pd); - setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - atomic_set(&pd->seq_nr, -1); - atomic_set(&pd->reorder_objects, 0); - atomic_set(&pd->refcnt, 0); - pd->pinst = pinst; - spin_lock_init(&pd->lock); - - return pd; - -err_free_squeue: - free_percpu(pd->squeue); -err_free_pqueue: - free_percpu(pd->pqueue); -err_free_pd: - kfree(pd); -err: - return NULL; -} - -static void padata_free_pd(struct parallel_data *pd) -{ - free_cpumask_var(pd->cpumask.pcpu); - free_cpumask_var(pd->cpumask.cbcpu); - free_percpu(pd->pqueue); - free_percpu(pd->squeue); - kfree(pd); -} - -/* Flush all objects out of the padata queues. 
*/ -static void padata_flush_queues(struct parallel_data *pd) -{ - int cpu; - struct padata_parallel_queue *pqueue; - struct padata_serial_queue *squeue; - - for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - flush_work(&pqueue->work); - } - - del_timer_sync(&pd->timer); - - if (atomic_read(&pd->reorder_objects)) - padata_reorder(pd); - - for_each_cpu(cpu, pd->cpumask.cbcpu) { - squeue = per_cpu_ptr(pd->squeue, cpu); - flush_work(&squeue->work); - } - - BUG_ON(atomic_read(&pd->refcnt) != 0); -} - -static void __padata_start(struct padata_instance *pinst) -{ - pinst->flags |= PADATA_INIT; -} - -static void __padata_stop(struct padata_instance *pinst) -{ - if (!(pinst->flags & PADATA_INIT)) - return; - - pinst->flags &= ~PADATA_INIT; - - synchronize_rcu(); - - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); -} - -/* Replace the internal control structure with a new one. */ -static void padata_replace(struct padata_instance *pinst, - struct parallel_data *pd_new) -{ - struct parallel_data *pd_old = pinst->pd; - int notification_mask = 0; - - pinst->flags |= PADATA_RESET; - - rcu_assign_pointer(pinst->pd, pd_new); - - synchronize_rcu(); - - if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) - notification_mask |= PADATA_CPU_PARALLEL; - if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) - notification_mask |= PADATA_CPU_SERIAL; - - padata_flush_queues(pd_old); - padata_free_pd(pd_old); - - if (notification_mask) - blocking_notifier_call_chain(&pinst->cpumask_change_notifier, - notification_mask, - &pd_new->cpumask); - - pinst->flags &= ~PADATA_RESET; -} - -/** - * padata_register_cpumask_notifier - Registers a notifier that will be called - * if either pcpu or cbcpu or both cpumasks change. - * - * @pinst: A poineter to padata instance - * @nblock: A pointer to notifier block. 
- */ -int padata_register_cpumask_notifier(struct padata_instance *pinst, - struct notifier_block *nblock) -{ - return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, - nblock); -} -EXPORT_SYMBOL(padata_register_cpumask_notifier); - -/** - * padata_unregister_cpumask_notifier - Unregisters cpumask notifier - * registered earlier using padata_register_cpumask_notifier - * - * @pinst: A pointer to data instance. - * @nlock: A pointer to notifier block. - */ -int padata_unregister_cpumask_notifier(struct padata_instance *pinst, - struct notifier_block *nblock) -{ - return blocking_notifier_chain_unregister( - &pinst->cpumask_change_notifier, - nblock); -} -EXPORT_SYMBOL(padata_unregister_cpumask_notifier); - - -/* If cpumask contains no active cpu, we mark the instance as invalid. */ -static bool padata_validate_cpumask(struct padata_instance *pinst, - const struct cpumask *cpumask) -{ - if (!cpumask_intersects(cpumask, cpu_active_mask)) { - pinst->flags |= PADATA_INVALID; - return false; - } - - pinst->flags &= ~PADATA_INVALID; - return true; -} - -static int __padata_set_cpumasks(struct padata_instance *pinst, - cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int valid; - struct parallel_data *pd; - - valid = padata_validate_cpumask(pinst, pcpumask); - if (!valid) { - __padata_stop(pinst); - goto out_replace; - } - - valid = padata_validate_cpumask(pinst, cbcpumask); - if (!valid) - __padata_stop(pinst); - -out_replace: - pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); - if (!pd) - return -ENOMEM; - - cpumask_copy(pinst->cpumask.pcpu, pcpumask); - cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - - padata_replace(pinst, pd); - - if (valid) - __padata_start(pinst); - - return 0; -} - -/** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. 
- * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - -/** - * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value - * equivalent to @cpumask. - * - * @pinst: padata instance - * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding - * to parallel and serial cpumasks respectively. - * @cpumask: the cpumask to use - */ -int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, - cpumask_var_t cpumask) -{ - struct cpumask *serial_mask, *parallel_mask; - int err = -EINVAL; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - switch (cpumask_type) { - case PADATA_CPU_PARALLEL: - serial_mask = pinst->cpumask.cbcpu; - parallel_mask = cpumask; - break; - case PADATA_CPU_SERIAL: - parallel_mask = pinst->cpumask.pcpu; - serial_mask = cpumask; - break; - default: - goto out; - } - - err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); - -out: - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_set_cpumask); - -static int __padata_add_cpu(struct padata_instance *pinst, int cpu) -{ - struct parallel_data *pd; - - if (cpumask_test_cpu(cpu, cpu_active_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, - pinst->cpumask.cbcpu); - if (!pd) - return -ENOMEM; - - padata_replace(pinst, pd); - - if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && - padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) - __padata_start(pinst); - } - - return 0; -} - - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata 
cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. - * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - -static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) -{ - struct parallel_data *pd = NULL; - - if (cpumask_test_cpu(cpu, cpu_online_mask)) { - - if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || - !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) - __padata_stop(pinst); - - pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, - pinst->cpumask.cbcpu); - if (!pd) - return -ENOMEM; - - padata_replace(pinst, pd); - } - - return 0; -} - - /** - * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) - * padata cpumasks. 
- * - * @pinst: padata instance - * @cpu: cpu to remove - * @mask: bitmask specifying from which cpumask @cpu should be removed - * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_remove_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_remove_cpu); - -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err =-EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - -#ifdef CONFIG_HOTPLUG_CPU - -static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) -{ - return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || - cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); -} - - -static int padata_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int err; - struct padata_instance *pinst; - int cpu = (unsigned long)hcpu; - - pinst = container_of(nfb, struct padata_instance, cpu_notifier); - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - 
break; - mutex_lock(&pinst->lock); - err = __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; - - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - } - - return NOTIFY_OK; -} -#endif - -static void __padata_free(struct padata_instance *pinst) -{ -#ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); -#endif - - padata_stop(pinst); - padata_free_pd(pinst->pd); - free_cpumask_var(pinst->cpumask.pcpu); - free_cpumask_var(pinst->cpumask.cbcpu); - kfree(pinst); -} - -#define kobj2pinst(_kobj) \ - container_of(_kobj, struct padata_instance, kobj) -#define attr2pentry(_attr) \ - container_of(_attr, struct padata_sysfs_entry, attr) - -static void padata_sysfs_release(struct kobject *kobj) -{ - struct padata_instance *pinst = kobj2pinst(kobj); - __padata_free(pinst); -} - -struct padata_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct padata_instance *, struct attribute *, char *); - ssize_t (*store)(struct padata_instance *, struct attribute *, - const char *, size_t); -}; - -static ssize_t show_cpumask(struct padata_instance *pinst, - struct attribute *attr, char *buf) -{ - struct cpumask *cpumask; - ssize_t len; - - mutex_lock(&pinst->lock); - if (!strcmp(attr->name, "serial_cpumask")) - cpumask = pinst->cpumask.cbcpu; - else - cpumask = pinst->cpumask.pcpu; - - len = bitmap_scnprintf(buf, 
PAGE_SIZE, cpumask_bits(cpumask), - nr_cpu_ids); - if (PAGE_SIZE - len < 2) - len = -EINVAL; - else - len += sprintf(buf + len, "\n"); - - mutex_unlock(&pinst->lock); - return len; -} - -static ssize_t store_cpumask(struct padata_instance *pinst, - struct attribute *attr, - const char *buf, size_t count) -{ - cpumask_var_t new_cpumask; - ssize_t ret; - int mask_type; - - if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) - return -ENOMEM; - - ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), - nr_cpumask_bits); - if (ret < 0) - goto out; - - mask_type = !strcmp(attr->name, "serial_cpumask") ? - PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; - ret = padata_set_cpumask(pinst, mask_type, new_cpumask); - if (!ret) - ret = count; - -out: - free_cpumask_var(new_cpumask); - return ret; -} - -#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ - static struct padata_sysfs_entry _name##_attr = \ - __ATTR(_name, 0644, _show_name, _store_name) -#define PADATA_ATTR_RO(_name, _show_name) \ - static struct padata_sysfs_entry _name##_attr = \ - __ATTR(_name, 0400, _show_name, NULL) - -PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); -PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); - -/* - * Padata sysfs provides the following objects: - * serial_cpumask [RW] - cpumask for serial workers - * parallel_cpumask [RW] - cpumask for parallel workers - */ -static struct attribute *padata_default_attrs[] = { - &serial_cpumask_attr.attr, - ¶llel_cpumask_attr.attr, - NULL, -}; - -static ssize_t padata_sysfs_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct padata_instance *pinst; - struct padata_sysfs_entry *pentry; - ssize_t ret = -EIO; - - pinst = kobj2pinst(kobj); - pentry = attr2pentry(attr); - if (pentry->show) - ret = pentry->show(pinst, attr, buf); - - return ret; -} - -static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct padata_instance *pinst; - struct 
padata_sysfs_entry *pentry; - ssize_t ret = -EIO; - - pinst = kobj2pinst(kobj); - pentry = attr2pentry(attr); - if (pentry->show) - ret = pentry->store(pinst, attr, buf, count); - - return ret; -} - -static const struct sysfs_ops padata_sysfs_ops = { - .show = padata_sysfs_show, - .store = padata_sysfs_store, -}; - -static struct kobj_type padata_attr_type = { - .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, - .release = padata_sysfs_release, -}; - -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) -{ - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); - -/** - * padata_alloc - allocate and initialize a padata instance and specify - * cpumasks for serial and parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - * @pcpumask: cpumask that will be used for padata parallelization - * @cbcpumask: cpumask that will be used for padata serialization - */ -struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) -{ - struct padata_instance *pinst; - struct parallel_data *pd = NULL; - - pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); - if (!pinst) - goto err; - - get_online_cpus(); - if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) - goto err_free_inst; - if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pinst->cpumask.pcpu); - goto err_free_inst; - } - if (!padata_validate_cpumask(pinst, pcpumask) || - !padata_validate_cpumask(pinst, cbcpumask)) - goto err_free_masks; - - pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); - if (!pd) - goto err_free_masks; - - rcu_assign_pointer(pinst->pd, pd); - - pinst->wq 
= wq; - - cpumask_copy(pinst->cpumask.pcpu, pcpumask); - cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - - pinst->flags = 0; - -#ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); -#endif - - put_online_cpus(); - - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); - kobject_init(&pinst->kobj, &padata_attr_type); - mutex_init(&pinst->lock); - - return pinst; - -err_free_masks: - free_cpumask_var(pinst->cpumask.pcpu); - free_cpumask_var(pinst->cpumask.cbcpu); -err_free_inst: - kfree(pinst); - put_online_cpus(); -err: - return NULL; -} -EXPORT_SYMBOL(padata_alloc); - -/** - * padata_free - free a padata instance - * - * @padata_inst: padata instance to free - */ -void padata_free(struct padata_instance *pinst) -{ - kobject_put(&pinst->kobj); -} -EXPORT_SYMBOL(padata_free); -/* - * linux/kernel/panic.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * This function is used through-out the kernel (including mm and fs) - * to indicate a major problem. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define PANIC_TIMER_STEP 100 -#define PANIC_BLINK_SPD 18 - -int panic_on_oops; -static unsigned long tainted_mask; -static int pause_on_oops; -static int pause_on_oops_flag; -static DEFINE_SPINLOCK(pause_on_oops_lock); - -int panic_timeout; -EXPORT_SYMBOL_GPL(panic_timeout); - -ATOMIC_NOTIFIER_HEAD(panic_notifier_list); - -EXPORT_SYMBOL(panic_notifier_list); - -static long no_blink(int state) -{ - return 0; -} - -/* Returns how long it waited in ms */ -long (*panic_blink)(int state); -EXPORT_SYMBOL(panic_blink); - -/* - * Stop ourself in panic -- architecture code may override this - */ -void __weak panic_smp_self_stop(void) -{ - while (1) - cpu_relax(); -} - -/** - * panic - halt the system - * @fmt: The text string to print - * - * Display a message, then perform cleanups. - * - * This function never returns. - */ -void panic(const char *fmt, ...) -{ - static DEFINE_SPINLOCK(panic_lock); - static char buf[1024]; - va_list args; - long i, i_next = 0; - int state = 0; - - /* - * It's possible to come here directly from a panic-assertion and - * not have preempt disabled. Some functions called from here want - * preempt to be disabled. No point enabling it later though... - * - * Only one CPU is allowed to execute the panic code from here. For - * multiple parallel invocations of panic, all other CPUs either - * stop themself or will wait until they are stopped by the 1st CPU - * with smp_send_stop(). 
- */ - if (!spin_trylock(&panic_lock)) - panic_smp_self_stop(); - - console_verbose(); - bust_spinlocks(1); - va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE - /* - * Avoid nested stack-dumping if a panic occurs during oops processing - */ - if (!oops_in_progress) - dump_stack(); -#endif - - /* - * If we have crashed and we have a crash kernel loaded let it handle - * everything else. - * Do we want to call this before we try to display a message? - */ - crash_kexec(NULL); - - kmsg_dump(KMSG_DUMP_PANIC); - - /* - * Note smp_send_stop is the usual smp shutdown function, which - * unfortunately means it may not be hardened to work in a panic - * situation. - */ - smp_send_stop(); - - atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - - bust_spinlocks(0); - - if (!panic_blink) - panic_blink = no_blink; - - if (panic_timeout > 0) { - /* - * Delay timeout seconds before rebooting the machine. - * We can't use the "normal" timers since we just panicked. - */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); - - for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { - touch_nmi_watchdog(); - if (i >= i_next) { - i += panic_blink(state ^= 1); - i_next = i + 3600 / PANIC_BLINK_SPD; - } - mdelay(PANIC_TIMER_STEP); - } - } - if (panic_timeout != 0) { - /* - * This will not be a clean reboot, with everything - * shutting down. But if there is a chance of - * rebooting the system it will be rebooted. 
- */ - emergency_restart(); - } -#ifdef __sparc__ - { - extern int stop_a_enabled; - /* Make sure the user can actually press Stop-A (L1-A) */ - stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); - } -#endif -#if defined(CONFIG_S390) - { - unsigned long caller; - - caller = (unsigned long)__builtin_return_address(0); - disabled_wait(caller); - } -#endif - local_irq_enable(); - for (i = 0; ; i += PANIC_TIMER_STEP) { - touch_softlockup_watchdog(); - if (i >= i_next) { - i += panic_blink(state ^= 1); - i_next = i + 3600 / PANIC_BLINK_SPD; - } - mdelay(PANIC_TIMER_STEP); - } -} - -EXPORT_SYMBOL(panic); - - -struct tnt { - u8 bit; - char true; - char false; -}; - -static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, - { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, - { TAINT_OOT_MODULE, 'O', ' ' }, -}; - -/** - * print_tainted - return a string to represent the kernel taint state. - * - * 'P' - Proprietary module has been loaded. - * 'F' - Module has been forcibly loaded. - * 'S' - SMP with CPUs not designed for SMP. - * 'R' - User forced a module unload. - * 'M' - System experienced a machine check exception. - * 'B' - System has hit bad_page. - * 'U' - Userspace-defined naughtiness. - * 'D' - Kernel has oopsed before - * 'A' - ACPI table overridden. - * 'W' - Taint on warning. - * 'C' - modules from drivers/staging are loaded. - * 'I' - Working around severe firmware bug. - * 'O' - Out-of-tree module has been loaded. - * - * The string is overwritten by the next call to print_tainted(). 
- */ -const char *print_tainted(void) -{ - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; - - if (tainted_mask) { - char *s; - int i; - - s = buf + sprintf(buf, "Tainted: "); - for (i = 0; i < ARRAY_SIZE(tnts); i++) { - const struct tnt *t = &tnts[i]; - *s++ = test_bit(t->bit, &tainted_mask) ? - t->true : t->false; - } - *s = 0; - } else - snprintf(buf, sizeof(buf), "Not tainted"); - - return buf; -} - -int test_taint(unsigned flag) -{ - return test_bit(flag, &tainted_mask); -} -EXPORT_SYMBOL(test_taint); - -unsigned long get_taint(void) -{ - return tainted_mask; -} - -void add_taint(unsigned flag) -{ - /* - * Can't trust the integrity of the kernel anymore. - * We don't call directly debug_locks_off() because the issue - * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging/out-of-tree - * development and post-warning case. - */ - switch (flag) { - case TAINT_CRAP: - case TAINT_OOT_MODULE: - case TAINT_WARN: - case TAINT_FIRMWARE_WORKAROUND: - break; - - default: - if (__debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); - } - - set_bit(flag, &tainted_mask); -} -EXPORT_SYMBOL(add_taint); - -static void spin_msec(int msecs) -{ - int i; - - for (i = 0; i < msecs; i++) { - touch_nmi_watchdog(); - mdelay(1); - } -} - -/* - * It just happens that oops_enter() and oops_exit() are identically - * implemented... 
- */ -static void do_oops_enter_exit(void) -{ - unsigned long flags; - static int spin_counter; - - if (!pause_on_oops) - return; - - spin_lock_irqsave(&pause_on_oops_lock, flags); - if (pause_on_oops_flag == 0) { - /* This CPU may now print the oops message */ - pause_on_oops_flag = 1; - } else { - /* We need to stall this CPU */ - if (!spin_counter) { - /* This CPU gets to do the counting */ - spin_counter = pause_on_oops; - do { - spin_unlock(&pause_on_oops_lock); - spin_msec(MSEC_PER_SEC); - spin_lock(&pause_on_oops_lock); - } while (--spin_counter); - pause_on_oops_flag = 0; - } else { - /* This CPU waits for a different one */ - while (spin_counter) { - spin_unlock(&pause_on_oops_lock); - spin_msec(1); - spin_lock(&pause_on_oops_lock); - } - } - } - spin_unlock_irqrestore(&pause_on_oops_lock, flags); -} - -/* - * Return true if the calling CPU is allowed to print oops-related info. - * This is a bit racy.. - */ -int oops_may_print(void) -{ - return pause_on_oops_flag == 0; -} - -/* - * Called when the architecture enters its oops handler, before it prints - * anything. If this is the first CPU to oops, and it's oopsing the first - * time then let it proceed. - * - * This is all enabled by the pause_on_oops kernel boot option. We do all - * this to ensure that oopses don't scroll off the screen. It has the - * side-effect of preventing later-oopsing CPUs from mucking up the display, - * too. - * - * It turns out that the CPU which is allowed to print ends up pausing for - * the right duration, whereas all the other CPUs pause for twice as long: - * once in oops_enter(), once in oops_exit(). 
- */ -void oops_enter(void) -{ - tracing_off(); - /* can't trust the integrity of the kernel anymore: */ - debug_locks_off(); - do_oops_enter_exit(); -} - -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else - oops_id++; - - return 0; -} -late_initcall(init_oops_id); - -void print_oops_end_marker(void) -{ - init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); -} - -/* - * Called when the architecture exits its oops handler, after printing - * everything. - */ -void oops_exit(void) -{ - do_oops_enter_exit(); - print_oops_end_marker(); - kmsg_dump(KMSG_DUMP_OOPS); -} - -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { - const char *fmt; - va_list args; -}; - -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) -{ - const char *board; - - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (board) - printk(KERN_WARNING "Hardware name: %s\n", board); - - if (args) - vprintk(args->fmt, args->args); - - print_modules(); - dump_stack(); - print_oops_end_marker(); - add_taint(taint); -} - -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ - struct slowpath_args args; - - args.fmt = fmt; - va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) 
-{ - struct slowpath_args args; - - args.fmt = fmt; - va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt_taint); - -void warn_slowpath_null(const char *file, int line) -{ - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); -#endif - -#ifdef CONFIG_CC_STACKPROTECTOR - -/* - * Called when gcc's -fstack-protector feature is used, and - * gcc detects corruption of the on-stack canary value - */ -void __stack_chk_fail(void) -{ - panic("stack-protector: Kernel stack is corrupted in: %p\n", - __builtin_return_address(0)); -} -EXPORT_SYMBOL(__stack_chk_fail); - -#endif - -core_param(panic, panic_timeout, int, 0644); -core_param(pause_on_oops, pause_on_oops, int, 0644); - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); -/* Helpers for initial module or kernel cmdline parsing - Copyright (C) 2001 Rusty Russell. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Protects all parameters, and incidentally kmalloced_param list. 
*/ -static DEFINE_MUTEX(param_lock); - -/* This just allows us to keep track of which parameters are kmalloced. */ -struct kmalloced_param { - struct list_head list; - char val[]; -}; -static LIST_HEAD(kmalloced_params); - -static void *kmalloc_parameter(unsigned int size) -{ - struct kmalloced_param *p; - - p = kmalloc(sizeof(*p) + size, GFP_KERNEL); - if (!p) - return NULL; - - list_add(&p->list, &kmalloced_params); - return p->val; -} - -/* Does nothing if parameter wasn't kmalloced above. */ -static void maybe_kfree_parameter(void *param) -{ - struct kmalloced_param *p; - - list_for_each_entry(p, &kmalloced_params, list) { - if (p->val == param) { - list_del(&p->list); - kfree(p); - break; - } - } -} - -static char dash2underscore(char c) -{ - if (c == '-') - return '_'; - return c; -} - -bool parameqn(const char *a, const char *b, size_t n) -{ - size_t i; - - for (i = 0; i < n; i++) { - if (dash2underscore(a[i]) != dash2underscore(b[i])) - return false; - } - return true; -} - -bool parameq(const char *a, const char *b) -{ - return parameqn(a, b, strlen(a)+1); -} - -static int parse_one(char *param, - char *val, - const struct kernel_param *params, - unsigned num_params, - int (*handle_unknown)(char *param, char *val)) -{ - unsigned int i; - int err; - - /* Find parameter */ - for (i = 0; i < num_params; i++) { - if (parameq(param, params[i].name)) { - /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool - && params[i].ops->set != param_set_bint) - return -EINVAL; - pr_debug("They are equal! Calling %p\n", - params[i].ops->set); - mutex_lock(¶m_lock); - err = params[i].ops->set(val, ¶ms[i]); - mutex_unlock(¶m_lock); - return err; - } - } - - if (handle_unknown) { - pr_debug("Unknown argument: calling %p\n", handle_unknown); - return handle_unknown(param, val); - } - - pr_debug("Unknown argument `%s'\n", param); - return -ENOENT; -} - -/* You can use " around spaces, but can't escape ". 
*/ -/* Hyphens and underscores equivalent in parameter names. */ -static char *next_arg(char *args, char **param, char **val) -{ - unsigned int i, equals = 0; - int in_quote = 0, quoted = 0; - char *next; - - if (*args == '"') { - args++; - in_quote = 1; - quoted = 1; - } - - for (i = 0; args[i]; i++) { - if (isspace(args[i]) && !in_quote) - break; - if (equals == 0) { - if (args[i] == '=') - equals = i; - } - if (args[i] == '"') - in_quote = !in_quote; - } - - *param = args; - if (!equals) - *val = NULL; - else { - args[equals] = '\0'; - *val = args + equals + 1; - - /* Don't include quotes in value. */ - if (**val == '"') { - (*val)++; - if (args[i-1] == '"') - args[i-1] = '\0'; - } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; - } - - if (args[i]) { - args[i] = '\0'; - next = args + i + 1; - } else - next = args + i; - - /* Chew up trailing spaces. */ - return skip_spaces(next); -} - -/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *name, - char *args, - const struct kernel_param *params, - unsigned num, - int (*unknown)(char *param, char *val)) -{ - char *param, *val; - - pr_debug("Parsing ARGS: %s\n", args); - - /* Chew leading spaces */ - args = skip_spaces(args); - - while (*args) { - int ret; - int irq_was_disabled; - - args = next_arg(args, ¶m, &val); - irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, unknown); - if (irq_was_disabled && !irqs_disabled()) { - printk(KERN_WARNING "parse_args(): option '%s' enabled " - "irq's!\n", param); - } - switch (ret) { - case -ENOENT: - printk(KERN_ERR "%s: Unknown parameter `%s'\n", - name, param); - return ret; - case -ENOSPC: - printk(KERN_ERR - "%s: `%s' too large for parameter `%s'\n", - name, val ?: "", param); - return ret; - case 0: - break; - default: - printk(KERN_ERR - "%s: `%s' invalid for parameter `%s'\n", - name, val ?: "", param); - return ret; - } - } - - /* All parsed OK. */ - return 0; -} - -/* Lazy bastard, eh? 
*/ -#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ - int param_set_##name(const char *val, const struct kernel_param *kp) \ - { \ - tmptype l; \ - int ret; \ - \ - ret = strtolfn(val, 0, &l); \ - if (ret < 0 || ((type)l != l)) \ - return ret < 0 ? ret : -EINVAL; \ - *((type *)kp->arg) = l; \ - return 0; \ - } \ - int param_get_##name(char *buffer, const struct kernel_param *kp) \ - { \ - return sprintf(buffer, format, *((type *)kp->arg)); \ - } \ - struct kernel_param_ops param_ops_##name = { \ - .set = param_set_##name, \ - .get = param_get_##name, \ - }; \ - EXPORT_SYMBOL(param_set_##name); \ - EXPORT_SYMBOL(param_get_##name); \ - EXPORT_SYMBOL(param_ops_##name) - - -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); - -int param_set_charp(const char *val, const struct kernel_param *kp) -{ - if (strlen(val) > 1024) { - printk(KERN_ERR "%s: string parameter too long\n", - kp->name); - return -ENOSPC; - } - - maybe_kfree_parameter(*(char **)kp->arg); - - /* This is a hack. We can't kmalloc in early boot, and we - * don't need to; this mangled commandline is preserved. 
*/ - if (slab_is_available()) { - *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); - if (!*(char **)kp->arg) - return -ENOMEM; - strcpy(*(char **)kp->arg, val); - } else - *(const char **)kp->arg = val; - - return 0; -} -EXPORT_SYMBOL(param_set_charp); - -int param_get_charp(char *buffer, const struct kernel_param *kp) -{ - return sprintf(buffer, "%s", *((char **)kp->arg)); -} -EXPORT_SYMBOL(param_get_charp); - -static void param_free_charp(void *arg) -{ - maybe_kfree_parameter(*((char **)arg)); -} - -struct kernel_param_ops param_ops_charp = { - .set = param_set_charp, - .get = param_get_charp, - .free = param_free_charp, -}; -EXPORT_SYMBOL(param_ops_charp); - -/* Actually could be a bool or an int, for historical reasons. */ -int param_set_bool(const char *val, const struct kernel_param *kp) -{ - bool v; - int ret; - - /* No equals means "set"... */ - if (!val) val = "1"; - - /* One of =[yYnN01] */ - ret = strtobool(val, &v); - if (ret) - return ret; - - if (kp->flags & KPARAM_ISBOOL) - *(bool *)kp->arg = v; - else - *(int *)kp->arg = v; - return 0; -} -EXPORT_SYMBOL(param_set_bool); - -int param_get_bool(char *buffer, const struct kernel_param *kp) -{ - bool val; - if (kp->flags & KPARAM_ISBOOL) - val = *(bool *)kp->arg; - else - val = *(int *)kp->arg; - - /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", val ? 'Y' : 'N'); -} -EXPORT_SYMBOL(param_get_bool); - -struct kernel_param_ops param_ops_bool = { - .set = param_set_bool, - .get = param_get_bool, -}; -EXPORT_SYMBOL(param_ops_bool); - -/* This one must be bool. 
*/ -int param_set_invbool(const char *val, const struct kernel_param *kp) -{ - int ret; - bool boolval; - struct kernel_param dummy; - - dummy.arg = &boolval; - dummy.flags = KPARAM_ISBOOL; - ret = param_set_bool(val, &dummy); - if (ret == 0) - *(bool *)kp->arg = !boolval; - return ret; -} -EXPORT_SYMBOL(param_set_invbool); - -int param_get_invbool(char *buffer, const struct kernel_param *kp) -{ - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); -} -EXPORT_SYMBOL(param_get_invbool); - -struct kernel_param_ops param_ops_invbool = { - .set = param_set_invbool, - .get = param_get_invbool, -}; -EXPORT_SYMBOL(param_ops_invbool); - -int param_set_bint(const char *val, const struct kernel_param *kp) -{ - struct kernel_param boolkp; - bool v; - int ret; - - /* Match bool exactly, by re-using it. */ - boolkp = *kp; - boolkp.arg = &v; - boolkp.flags |= KPARAM_ISBOOL; - - ret = param_set_bool(val, &boolkp); - if (ret == 0) - *(int *)kp->arg = v; - return ret; -} -EXPORT_SYMBOL(param_set_bint); - -struct kernel_param_ops param_ops_bint = { - .set = param_set_bint, - .get = param_get_int, -}; -EXPORT_SYMBOL(param_ops_bint); - -/* We break the rule and mangle the string. */ -static int param_array(const char *name, - const char *val, - unsigned int min, unsigned int max, - void *elem, int elemsize, - int (*set)(const char *, const struct kernel_param *kp), - u16 flags, - unsigned int *num) -{ - int ret; - struct kernel_param kp; - char save; - - /* Get the name right for errors. */ - kp.name = name; - kp.arg = elem; - kp.flags = flags; - - *num = 0; - /* We expect a comma-separated list of values. 
*/ - do { - int len; - - if (*num == max) { - printk(KERN_ERR "%s: can only take %i arguments\n", - name, max); - return -EINVAL; - } - len = strcspn(val, ","); - - /* nul-terminate and parse */ - save = val[len]; - ((char *)val)[len] = '\0'; - BUG_ON(!mutex_is_locked(¶m_lock)); - ret = set(val, &kp); - - if (ret != 0) - return ret; - kp.arg += elemsize; - val += len+1; - (*num)++; - } while (save == ','); - - if (*num < min) { - printk(KERN_ERR "%s: needs at least %i arguments\n", - name, min); - return -EINVAL; - } - return 0; -} - -static int param_array_set(const char *val, const struct kernel_param *kp) -{ - const struct kparam_array *arr = kp->arr; - unsigned int temp_num; - - return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->ops->set, kp->flags, - arr->num ?: &temp_num); -} - -static int param_array_get(char *buffer, const struct kernel_param *kp) -{ - int i, off, ret; - const struct kparam_array *arr = kp->arr; - struct kernel_param p; - - p = *kp; - for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { - if (i) - buffer[off++] = ','; - p.arg = arr->elem + arr->elemsize * i; - BUG_ON(!mutex_is_locked(¶m_lock)); - ret = arr->ops->get(buffer + off, &p); - if (ret < 0) - return ret; - off += ret; - } - buffer[off] = '\0'; - return off; -} - -static void param_array_free(void *arg) -{ - unsigned int i; - const struct kparam_array *arr = arg; - - if (arr->ops->free) - for (i = 0; i < (arr->num ? 
*arr->num : arr->max); i++) - arr->ops->free(arr->elem + arr->elemsize * i); -} - -struct kernel_param_ops param_array_ops = { - .set = param_array_set, - .get = param_array_get, - .free = param_array_free, -}; -EXPORT_SYMBOL(param_array_ops); - -int param_set_copystring(const char *val, const struct kernel_param *kp) -{ - const struct kparam_string *kps = kp->str; - - if (strlen(val)+1 > kps->maxlen) { - printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", - kp->name, kps->maxlen-1); - return -ENOSPC; - } - strcpy(kps->string, val); - return 0; -} -EXPORT_SYMBOL(param_set_copystring); - -int param_get_string(char *buffer, const struct kernel_param *kp) -{ - const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); -} -EXPORT_SYMBOL(param_get_string); - -struct kernel_param_ops param_ops_string = { - .set = param_set_copystring, - .get = param_get_string, -}; -EXPORT_SYMBOL(param_ops_string); - -/* sysfs output in /sys/modules/XYZ/parameters/ */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr) -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) - -extern struct kernel_param __start___param[], __stop___param[]; - -struct param_attribute -{ - struct module_attribute mattr; - const struct kernel_param *param; -}; - -struct module_param_attrs -{ - unsigned int num; - struct attribute_group grp; - struct param_attribute attrs[0]; -}; - -#ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr) - -static ssize_t param_attr_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) -{ - int count; - struct param_attribute *attribute = to_param_attr(mattr); - - if (!attribute->param->ops->get) - return -EPERM; - - mutex_lock(¶m_lock); - count = attribute->param->ops->get(buf, attribute->param); - mutex_unlock(¶m_lock); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } - return count; -} - -/* sysfs always hands a nul-terminated 
string in buf. We rely on that. */ -static ssize_t param_attr_store(struct module_attribute *mattr, - struct module_kobject *km, - const char *buf, size_t len) -{ - int err; - struct param_attribute *attribute = to_param_attr(mattr); - - if (!attribute->param->ops->set) - return -EPERM; - - mutex_lock(¶m_lock); - err = attribute->param->ops->set(buf, attribute->param); - mutex_unlock(¶m_lock); - if (!err) - return len; - return err; -} -#endif - -#ifdef CONFIG_MODULES -#define __modinit -#else -#define __modinit __init -#endif - -#ifdef CONFIG_SYSFS -void __kernel_param_lock(void) -{ - mutex_lock(¶m_lock); -} -EXPORT_SYMBOL(__kernel_param_lock); - -void __kernel_param_unlock(void) -{ - mutex_unlock(¶m_lock); -} -EXPORT_SYMBOL(__kernel_param_unlock); - -/* - * add_sysfs_param - add a parameter to sysfs - * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs - * @name: name of parameter - * - * Create a kobject if for a (per-module) parameter if mp NULL, and - * create file in sysfs. Returns an error on out of memory. Always cleans up - * if there's an error. - */ -static __modinit int add_sysfs_param(struct module_kobject *mk, - const struct kernel_param *kp, - const char *name) -{ - struct module_param_attrs *new; - struct attribute **attrs; - int err, num; - - /* We don't bother calling this with invisible parameters. */ - BUG_ON(!kp->perm); - - if (!mk->mp) { - num = 0; - attrs = NULL; - } else { - num = mk->mp->num; - attrs = mk->mp->grp.attrs; - } - - /* Enlarge. */ - new = krealloc(mk->mp, - sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), - GFP_KERNEL); - if (!new) { - kfree(mk->mp); - err = -ENOMEM; - goto fail; - } - attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); - if (!attrs) { - err = -ENOMEM; - goto fail_free_new; - } - - /* Sysfs wants everything zeroed. 
*/ - memset(new, 0, sizeof(*new)); - memset(&new->attrs[num], 0, sizeof(new->attrs[num])); - memset(&attrs[num], 0, sizeof(attrs[num])); - new->grp.name = "parameters"; - new->grp.attrs = attrs; - - /* Tack new one on the end. */ - sysfs_attr_init(&new->attrs[num].mattr.attr); - new->attrs[num].param = kp; - new->attrs[num].mattr.show = param_attr_show; - new->attrs[num].mattr.store = param_attr_store; - new->attrs[num].mattr.attr.name = (char *)name; - new->attrs[num].mattr.attr.mode = kp->perm; - new->num = num+1; - - /* Fix up all the pointers, since krealloc can move us */ - for (num = 0; num < new->num; num++) - new->grp.attrs[num] = &new->attrs[num].mattr.attr; - new->grp.attrs[num] = NULL; - - mk->mp = new; - return 0; - -fail_free_new: - kfree(new); -fail: - mk->mp = NULL; - return err; -} - -#ifdef CONFIG_MODULES -static void free_module_param_attrs(struct module_kobject *mk) -{ - kfree(mk->mp->grp.attrs); - kfree(mk->mp); - mk->mp = NULL; -} - -/* - * module_param_sysfs_setup - setup sysfs support for one module - * @mod: module - * @kparam: module parameters (array) - * @num_params: number of module parameters - * - * Adds sysfs entries for module parameters under - * /sys/module/[mod->name]/parameters/ - */ -int module_param_sysfs_setup(struct module *mod, - const struct kernel_param *kparam, - unsigned int num_params) -{ - int i, err; - bool params = false; - - for (i = 0; i < num_params; i++) { - if (kparam[i].perm == 0) - continue; - err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); - if (err) - return err; - params = true; - } - - if (!params) - return 0; - - /* Create the param group. */ - err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); - if (err) - free_module_param_attrs(&mod->mkobj); - return err; -} - -/* - * module_param_sysfs_remove - remove sysfs support for one module - * @mod: module - * - * Remove sysfs entries for module parameters and the corresponding - * kobject. 
- */ -void module_param_sysfs_remove(struct module *mod) -{ - if (mod->mkobj.mp) { - sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); - /* We are positive that no one is using any param - * attrs at this point. Deallocate immediately. */ - free_module_param_attrs(&mod->mkobj); - } -} -#endif - -void destroy_params(const struct kernel_param *params, unsigned num) -{ - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].ops->free) - params[i].ops->free(params[i].arg); -} - -static struct module_kobject * __init locate_module_kobject(const char *name) -{ - struct module_kobject *mk; - struct kobject *kobj; - int err; - - kobj = kset_find_obj(module_kset, name); - if (kobj) { - mk = to_module_kobject(kobj); - } else { - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, - "%s", name); -#ifdef CONFIG_MODULES - if (!err) - err = sysfs_create_file(&mk->kobj, &module_uevent.attr); -#endif - if (err) { - kobject_put(&mk->kobj); - printk(KERN_ERR - "Module '%s' failed add to sysfs, error number %d\n", - name, err); - printk(KERN_ERR - "The system will be unstable now.\n"); - return NULL; - } - - /* So that we hold reference in both cases. */ - kobject_get(&mk->kobj); - } - - return mk; -} - -static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, - unsigned int name_skip) -{ - struct module_kobject *mk; - int err; - - mk = locate_module_kobject(name); - if (!mk) - return; - - /* We need to remove old parameters before adding more. */ - if (mk->mp) - sysfs_remove_group(&mk->kobj, &mk->mp->grp); - - /* These should not fail at boot. 
*/ - err = add_sysfs_param(mk, kparam, kparam->name + name_skip); - BUG_ON(err); - err = sysfs_create_group(&mk->kobj, &mk->mp->grp); - BUG_ON(err); - kobject_uevent(&mk->kobj, KOBJ_ADD); - kobject_put(&mk->kobj); -} - -/* - * param_sysfs_builtin - add contents in /sys/parameters for built-in modules - * - * Add module_parameters to sysfs for "modules" built into the kernel. - * - * The "module" name (KBUILD_MODNAME) is stored before a dot, the - * "parameter" name is stored behind a dot in kernel_param->name. So, - * extract the "module" name for all built-in kernel_param-eters, - * and for all who have the same, call kernel_add_sysfs_param. - */ -static void __init param_sysfs_builtin(void) -{ - struct kernel_param *kp; - unsigned int name_len; - char modname[MODULE_NAME_LEN]; - - for (kp = __start___param; kp < __stop___param; kp++) { - char *dot; - - if (kp->perm == 0) - continue; - - dot = strchr(kp->name, '.'); - if (!dot) { - /* This happens for core_param() */ - strcpy(modname, "kernel"); - name_len = 0; - } else { - name_len = dot - kp->name + 1; - strlcpy(modname, kp->name, name_len); - } - kernel_add_sysfs_param(modname, kp, name_len); - } -} - -ssize_t __modver_version_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) -{ - struct module_version_attribute *vattr = - container_of(mattr, struct module_version_attribute, mattr); - - return sprintf(buf, "%s\n", vattr->version); -} - -extern const struct module_version_attribute *__start___modver[]; -extern const struct module_version_attribute *__stop___modver[]; - -static void __init version_sysfs_builtin(void) -{ - const struct module_version_attribute **p; - struct module_kobject *mk; - int err; - - for (p = __start___modver; p < __stop___modver; p++) { - const struct module_version_attribute *vattr = *p; - - mk = locate_module_kobject(vattr->module_name); - if (mk) { - err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); - kobject_uevent(&mk->kobj, KOBJ_ADD); - 
kobject_put(&mk->kobj); - } - } -} - -/* module-related sysfs stuff */ - -static ssize_t module_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct module_attribute *attribute; - struct module_kobject *mk; - int ret; - - attribute = to_module_attr(attr); - mk = to_module_kobject(kobj); - - if (!attribute->show) - return -EIO; - - ret = attribute->show(attribute, mk, buf); - - return ret; -} - -static ssize_t module_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct module_attribute *attribute; - struct module_kobject *mk; - int ret; - - attribute = to_module_attr(attr); - mk = to_module_kobject(kobj); - - if (!attribute->store) - return -EIO; - - ret = attribute->store(attribute, mk, buf, len); - - return ret; -} - -static const struct sysfs_ops module_sysfs_ops = { - .show = module_attr_show, - .store = module_attr_store, -}; - -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &module_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops module_uevent_ops = { - .filter = uevent_filter, -}; - -struct kset *module_kset; -int module_sysfs_initialized; - -struct kobj_type module_ktype = { - .sysfs_ops = &module_sysfs_ops, -}; - -/* - * param_sysfs_init - wrapper for built-in params support - */ -static int __init param_sysfs_init(void) -{ - module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); - if (!module_kset) { - printk(KERN_WARNING "%s (%d): error creating kset\n", - __FILE__, __LINE__); - return -ENOMEM; - } - module_sysfs_initialized = 1; - - version_sysfs_builtin(); - param_sysfs_builtin(); - - return 0; -} -subsys_initcall(param_sysfs_init); - -#endif /* CONFIG_SYSFS */ -/* - * Generic pidhash and scalable, time-bounded PID allocator - * - * (C) 2002-2003 William Irwin, IBM - * (C) 2004 William Irwin, Oracle - * (C) 2002-2004 Ingo Molnar, Red Hat - * - * 
pid-structures are backing objects for tasks sharing a given ID to chain - * against. There is very little to them aside from hashing them and - * parking tasks using given ID's on a list. - * - * The hash is always changed with the tasklist_lock write-acquired, - * and the hash is only accessed with the tasklist_lock at least - * read-acquired, so there's no additional SMP locking needed here. - * - * We have a list of bitmap pages, which bitmaps represent the PID space. - * Allocating and freeing PIDs is completely lockless. The worst-case - * allocation scenario when all but one out of 1 million PIDs possible are - * allocated already: the scanning of 32 list entries and at most PAGE_SIZE - * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). - * - * Pid namespaces: - * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. - * (C) 2007 Sukadev Bhattiprolu , IBM - * Many thanks to Oleg Nesterov for comments and help - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; -struct pid init_struct_pid = INIT_STRUCT_PID; - -int pid_max = PID_MAX_DEFAULT; - -#define RESERVED_PIDS 300 - -int pid_max_min = RESERVED_PIDS + 1; -int pid_max_max = PID_MAX_LIMIT; - -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) - -/* - * PID-map pages start out as NULL, they get allocated upon - * first use and are never deallocated. This way a low pid_max - * value does not cause lots of bitmaps to be allocated, but - * the scheme scales to up to 4 million PIDs, runtime. 
- */ -struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .pidmap = { - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, - .level = 0, - .child_reaper = &init_task, -}; -EXPORT_SYMBOL_GPL(init_pid_ns); - -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. - */ - -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); - -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. 
- */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. - */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). 
- */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - break; - } - if (likely(atomic_read(&map->nr_free))) { - do { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - pid = mk_pid(pid_ns, map, offset); - } while (offset < BITS_PER_PAGE && pid < pid_max); - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -1; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - -void put_pid(struct pid *pid) -{ - struct pid_namespace *ns; - - if (!pid) - return; - - ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { - kmem_cache_free(ns->pid_cachep, pid); - put_pid_ns(ns); - } -} -EXPORT_SYMBOL_GPL(put_pid); - -static void delayed_put_pid(struct rcu_head *rhp) -{ - struct pid *pid = container_of(rhp, struct pid, rcu); - put_pid(pid); -} - -void free_pid(struct pid *pid) -{ - /* We can 
be called with write_lock_irq(&tasklist_lock) held */ - int i; - unsigned long flags; - - spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); - spin_unlock_irqrestore(&pidmap_lock, flags); - - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - - call_rcu(&pid->rcu, delayed_put_pid); -} - -struct pid *alloc_pid(struct pid_namespace *ns) -{ - struct pid *pid; - enum pid_type type; - int i, nr; - struct pid_namespace *tmp; - struct upid *upid; - - pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); - if (!pid) - goto out; - - tmp = ns; - for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); - if (nr < 0) - goto out_free; - - pid->numbers[i].nr = nr; - pid->numbers[i].ns = tmp; - tmp = tmp->parent; - } - - get_pid_ns(ns); - pid->level = ns->level; - atomic_set(&pid->count, 1); - for (type = 0; type < PIDTYPE_MAX; ++type) - INIT_HLIST_HEAD(&pid->tasks[type]); - - upid = pid->numbers + ns->level; - spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); - spin_unlock_irq(&pidmap_lock); - -out: - return pid; - -out_free: - while (++i <= ns->level) - free_pidmap(pid->numbers + i); - - kmem_cache_free(ns->pid_cachep, pid); - pid = NULL; - goto out; -} - -struct pid *find_pid_ns(int nr, struct pid_namespace *ns) -{ - struct hlist_node *elem; - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, elem, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; -} -EXPORT_SYMBOL_GPL(find_pid_ns); - -struct pid *find_vpid(int nr) -{ - return find_pid_ns(nr, current->nsproxy->pid_ns); -} -EXPORT_SYMBOL_GPL(find_vpid); - -/* - * attach_pid() must be called with the tasklist_lock write-held. 
- */ -void attach_pid(struct task_struct *task, enum pid_type type, - struct pid *pid) -{ - struct pid_link *link; - - link = &task->pids[type]; - link->pid = pid; - hlist_add_head_rcu(&link->node, &pid->tasks[type]); -} - -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) -{ - struct pid_link *link; - struct pid *pid; - int tmp; - - link = &task->pids[type]; - pid = link->pid; - - hlist_del_rcu(&link->node); - link->pid = new; - - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (!hlist_empty(&pid->tasks[tmp])) - return; - - free_pid(pid); -} - -void detach_pid(struct task_struct *task, enum pid_type type) -{ - __change_pid(task, type, NULL); -} - -void change_pid(struct task_struct *task, enum pid_type type, - struct pid *pid) -{ - __change_pid(task, type, pid); - attach_pid(task, type, pid); -} - -/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ -void transfer_pid(struct task_struct *old, struct task_struct *new, - enum pid_type type) -{ - new->pids[type].pid = old->pids[type].pid; - hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); -} - -struct task_struct *pid_task(struct pid *pid, enum pid_type type) -{ - struct task_struct *result = NULL; - if (pid) { - struct hlist_node *first; - first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), - lockdep_tasklist_lock_is_held()); - if (first) - result = hlist_entry(first, struct task_struct, pids[(type)].node); - } - return result; -} -EXPORT_SYMBOL(pid_task); - -/* - * Must be called under rcu_read_lock(). 
- */ -struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) -{ - rcu_lockdep_assert(rcu_read_lock_held(), - "find_task_by_pid_ns() needs rcu_read_lock()" - " protection"); - return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); -} - -struct task_struct *find_task_by_vpid(pid_t vnr) -{ - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); -} - -struct pid *get_task_pid(struct task_struct *task, enum pid_type type) -{ - struct pid *pid; - rcu_read_lock(); - if (type != PIDTYPE_PID) - task = task->group_leader; - pid = get_pid(task->pids[type].pid); - rcu_read_unlock(); - return pid; -} -EXPORT_SYMBOL_GPL(get_task_pid); - -struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) -{ - struct task_struct *result; - rcu_read_lock(); - result = pid_task(pid, type); - if (result) - get_task_struct(result); - rcu_read_unlock(); - return result; -} -EXPORT_SYMBOL_GPL(get_pid_task); - -struct pid *find_get_pid(pid_t nr) -{ - struct pid *pid; - - rcu_read_lock(); - pid = get_pid(find_vpid(nr)); - rcu_read_unlock(); - - return pid; -} -EXPORT_SYMBOL_GPL(find_get_pid); - -pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) -{ - struct upid *upid; - pid_t nr = 0; - - if (pid && ns->level <= pid->level) { - upid = &pid->numbers[ns->level]; - if (upid->ns == ns) - nr = upid->nr; - } - return nr; -} - -pid_t pid_vnr(struct pid *pid) -{ - return pid_nr_ns(pid, current->nsproxy->pid_ns); -} -EXPORT_SYMBOL_GPL(pid_vnr); - -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, - struct pid_namespace *ns) -{ - pid_t nr = 0; - - rcu_read_lock(); - if (!ns) - ns = current->nsproxy->pid_ns; - if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) - task = task->group_leader; - nr = pid_nr_ns(task->pids[type].pid, ns); - } - rcu_read_unlock(); - - return nr; -} -EXPORT_SYMBOL(__task_pid_nr_ns); - -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} 
-EXPORT_SYMBOL(task_tgid_nr_ns); - -struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) -{ - return ns_of_pid(task_pid(tsk)); -} -EXPORT_SYMBOL_GPL(task_active_pid_ns); - -/* - * Used by proc to find the first pid that is greater than or equal to nr. - * - * If there is a pid at nr this function is exactly the same as find_pid_ns. - */ -struct pid *find_ge_pid(int nr, struct pid_namespace *ns) -{ - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; -} - -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. - */ -void __init pidhash_init(void) -{ - unsigned int i, pidhash_size; - - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, - &pidhash_shift, NULL, 4096); - pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); -} - -void __init pidmap_init(void) -{ - /* bump default and minimum pid_max based on number of cpus */ - pid_max = min(pid_max_max, max_t(int, pid_max, - PIDS_PER_CPU_DEFAULT * num_possible_cpus())); - pid_max_min = max_t(int, pid_max_min, - PIDS_PER_CPU_MIN * num_possible_cpus()); - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); - - init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); -} -/* - * Pid namespaces - * - * Authors: - * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. 
- * (C) 2007 Sukadev Bhattiprolu , IBM - * Many thanks to Oleg Nesterov for comments and help - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#define BITS_PER_PAGE (PAGE_SIZE*8) - -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); -static DEFINE_MUTEX(pid_caches_mutex); -static struct kmem_cache *pid_ns_cachep; - -/* - * creates the kmem cache to allocate pids from. - * @nr_ids: the number of numerical ids this pid will have to carry - */ - -static struct kmem_cache *create_pid_cachep(int nr_ids) -{ - struct pid_cache *pcache; - struct kmem_cache *cachep; - - mutex_lock(&pid_caches_mutex); - list_for_each_entry(pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: - mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; -} - -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) -{ - struct pid_namespace *ns; - unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; - - ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); - if (ns == NULL) - goto out; - - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; - - ns->pid_cachep = create_pid_cachep(level + 1); - if (ns->pid_cachep == NULL) - goto out_free_map; - - kref_init(&ns->kref); - ns->level = level; - ns->parent = get_pid_ns(parent_pid_ns); 
- - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - - return ns; - -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); -out_free_map: - kfree(ns->pidmap[0].page); -out_free: - kmem_cache_free(pid_ns_cachep, ns); -out: - return ERR_PTR(err); -} - -static void destroy_pid_namespace(struct pid_namespace *ns) -{ - int i; - - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); - kmem_cache_free(pid_ns_cachep, ns); -} - -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) -{ - if (!(flags & CLONE_NEWPID)) - return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) - return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); -} - -void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns, *parent; - - ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; - destroy_pid_namespace(ns); - - if (parent != NULL) - put_pid_ns(parent); -} - -void zap_pid_ns_processes(struct pid_namespace *pid_ns) -{ - int nr; - int rc; - struct task_struct *task; - - /* - * The last thread in the cgroup-init thread group is terminating. - * Find remaining pid_ts in the namespace, signal and wait for them - * to exit. - * - * Note: This signals each threads in the namespace - even those that - * belong to the same thread group, To avoid this, we would have - * to walk the entire tasklist looking a processes in this - * namespace, but that could be unnecessarily expensive if the - * pid namespace has just a few processes. Or we need to - * maintain a tasklist for each pid namespace. 
- * - */ - read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). - */ - task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); - } - read_unlock(&tasklist_lock); - - do { - clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); - } while (rc != -ECHILD); - - acct_exit_ns(pid_ns); - return; -} - -static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table tmp = *table; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* - * Writing directly to ns' last_pid field is OK, since this field - * is volatile in a living namespace anyway and a code writing to - * it should synchronize its usage with external means. - */ - - tmp.data = ¤t->nsproxy->pid_ns->last_pid; - return proc_dointvec(&tmp, write, buffer, lenp, ppos); -} - -static struct ctl_table pid_ns_ctl_table[] = { - { - .procname = "ns_last_pid", - .maxlen = sizeof(int), - .mode = 0666, /* permissions are checked in the handler */ - .proc_handler = pid_ns_ctl_handler, - }, - { } -}; - -static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; - -static __init int pid_namespaces_init(void) -{ - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); - register_sysctl_paths(kern_path, pid_ns_ctl_table); - return 0; -} - -__initcall(pid_namespaces_init); -/* - * Implement CPU time clocks for the POSIX clock interface. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->cputime_expires expiration cache if necessary. Needs - * siglock protection since other code may update expiration cache as - * well. 
- */ -void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) -{ - cputime_t cputime = secs_to_cputime(rlim_new); - - spin_lock_irq(&task->sighand->siglock); - set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(&task->sighand->siglock); -} - -static int check_clock(const clockid_t which_clock) -{ - int error = 0; - struct task_struct *p; - const pid_t pid = CPUCLOCK_PID(which_clock); - - if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) - return -EINVAL; - - if (pid == 0) - return 0; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? - same_thread_group(p, current) : has_group_leader_pid(p))) { - error = -EINVAL; - } - rcu_read_unlock(); - - return error; -} - -static inline union cpu_time_count -timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) -{ - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; - } else { - ret.cpu = timespec_to_cputime(tp); - } - return ret; -} - -static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, - struct timespec *tp) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); - else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - 
union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; -} - -/* - * Update expiry time from increment, and increase overrun count, - * given the current clock sample. - */ -static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) -{ - int i; - - if (timer->it.cpu.incr.sched == 0) - return; - - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; - - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; - - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } -} - -static inline cputime_t prof_ticks(struct task_struct *p) -{ - return p->utime + p->stime; -} -static inline cputime_t virt_ticks(struct task_struct *p) -{ - return p->utime; -} - -static int -posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) -{ - int error = check_clock(which_clock); - if (!error) { - tp->tv_sec = 0; - tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - /* - * If sched_clock is using a cycle counter, we - * don't have any idea of its true resolution - * exported, but it is much more than 1s/HZ. - */ - tp->tv_nsec = 1; - } - } - return error; -} - -static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) -{ - /* - * You can never reset a CPU clock, but we check for other errors - * in the call before failing with EPERM. - */ - int error = check_clock(which_clock); - if (error == 0) { - error = -EPERM; - } - return error; -} - - -/* - * Sample a per-thread clock for the given task. 
- */ -static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) -{ - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); - break; - case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); - break; - case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); - break; - } - return 0; -} - -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { - times->utime += t->utime; - times->stime += t->stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: - rcu_read_unlock(); -} - -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) -{ - if (b->utime > a->utime) - a->utime = b->utime; - - if (b->stime > a->stime) - a->stime = b->stime; - - if (b->sum_exec_runtime > a->sum_exec_runtime) - a->sum_exec_runtime = b->sum_exec_runtime; -} - -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime sum; - unsigned long flags; - - if (!cputimer->running) { - /* - * The POSIX timer interface allows for absolute time expiry - * values through the TIMER_ABSTIME flag, therefore we have - * to synchronize the timer to the clock every time we start - * it. 
- */ - thread_group_cputime(tsk, &sum); - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 1; - update_gt_cputime(&cputimer->cputime, &sum); - } else - raw_spin_lock_irqsave(&cputimer->lock, flags); - *times = cputimer->cputime; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); -} - -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - struct task_cputime cputime; - - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; - break; - case CPUCLOCK_VIRT: - thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; - break; - case CPUCLOCK_SCHED: - thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; - break; - } - return 0; -} - - -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) -{ - const pid_t pid = CPUCLOCK_PID(which_clock); - int error = -EINVAL; - union cpu_time_count rtn; - - if (pid == 0) { - /* - * Special case constant value for our own clocks. - * We don't have to do any lookup to find ourselves. - */ - if (CPUCLOCK_PERTHREAD(which_clock)) { - /* - * Sampling just ourselves we can do with no locking. - */ - error = cpu_clock_sample(which_clock, - current, &rtn); - } else { - read_lock(&tasklist_lock); - error = cpu_clock_sample_group(which_clock, - current, &rtn); - read_unlock(&tasklist_lock); - } - } else { - /* - * Find the given PID, and validate that the caller - * should be able to see it. 
- */ - struct task_struct *p; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) { - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(p, current)) { - error = cpu_clock_sample(which_clock, - p, &rtn); - } - } else { - read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->sighand) { - error = - cpu_clock_sample_group(which_clock, - p, &rtn); - } - read_unlock(&tasklist_lock); - } - } - rcu_read_unlock(); - } - - if (error) - return error; - sample_to_timespec(which_clock, rtn, tp); - return 0; -} - - -/* - * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. - * This is called from sys_timer_create() and do_cpu_nanosleep() with the - * new timer already all-zeros initialized. - */ -static int posix_cpu_timer_create(struct k_itimer *new_timer) -{ - int ret = 0; - const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); - struct task_struct *p; - - if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) - return -EINVAL; - - INIT_LIST_HEAD(&new_timer->it.cpu.entry); - - rcu_read_lock(); - if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { - if (pid == 0) { - p = current; - } else { - p = find_task_by_vpid(pid); - if (p && !same_thread_group(p, current)) - p = NULL; - } - } else { - if (pid == 0) { - p = current->group_leader; - } else { - p = find_task_by_vpid(pid); - if (p && !has_group_leader_pid(p)) - p = NULL; - } - } - new_timer->it.cpu.task = p; - if (p) { - get_task_struct(p); - } else { - ret = -EINVAL; - } - rcu_read_unlock(); - - return ret; -} - -/* - * Clean up a CPU-clock timer that is about to be destroyed. - * This is called from timer deletion with the timer already locked. - * If we return TIMER_RETRY, it's necessary to release the timer's lock - * and try again. (This happens when the timer is in the middle of firing.) 
- */ -static int posix_cpu_timer_del(struct k_itimer *timer) -{ - struct task_struct *p = timer->it.cpu.task; - int ret = 0; - - if (likely(p != NULL)) { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { - /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. - */ - BUG_ON(!list_empty(&timer->it.cpu.entry)); - } else { - spin_lock(&p->sighand->siglock); - if (timer->it.cpu.firing) - ret = TIMER_RETRY; - else - list_del(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); - } - read_unlock(&tasklist_lock); - - if (!ret) - put_task_struct(p); - } - - return ret; -} - -/* - * Clean out CPU timers still ticking when a thread exited. The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. - * This must be called with the siglock held. - */ -static void cleanup_timers(struct list_head *head, - cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) -{ - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= ptime; - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= utime; - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; - } else { - timer->expires.sched -= sum_exec_runtime; - } - } -} - -/* - * These are both called with the siglock held, when the current thread - * is being reaped. When the final (leader) thread in the group is reaped, - * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. 
- */ -void posix_cpu_timers_exit(struct task_struct *tsk) -{ - cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); - -} -void posix_cpu_timers_exit_group(struct task_struct *tsk) -{ - struct signal_struct *const sig = tsk->signal; - - cleanup_timers(tsk->signal->cpu_timers, - tsk->utime + sig->utime, tsk->stime + sig->stime, - tsk->se.sum_exec_runtime + sig->sum_sched_runtime); -} - -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) -{ - /* - * That's all for this thread or process. - * We leave our residual in expires to be reported. - */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); -} - -static inline int expires_gt(cputime_t expires, cputime_t new_exp) -{ - return expires == 0 || expires > new_exp; -} - -/* - * Insert the timer on the appropriate list before any timers that - * expire later. This must be called with the tasklist_lock held - * for reading, interrupts disabled and p->sighand->siglock taken. - */ -static void arm_timer(struct k_itimer *timer) -{ - struct task_struct *p = timer->it.cpu.task; - struct list_head *head, *listpos; - struct task_cputime *cputime_expires; - struct cpu_timer_list *const nt = &timer->it.cpu; - struct cpu_timer_list *next; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->cpu_timers; - cputime_expires = &p->cputime_expires; - } else { - head = p->signal->cpu_timers; - cputime_expires = &p->signal->cputime_expires; - } - head += CPUCLOCK_WHICH(timer->it_clock); - - listpos = head; - list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) - break; - listpos = &next->entry; - } - list_add(&nt->entry, listpos); - - if (listpos == head) { - union cpu_time_count *exp = &nt->expires; - - /* - * We are the new earliest-expiring POSIX 1.b timer, hence - * need to update expiration cache. 
Take into account that - * for process timers we share expiration cache with itimers - * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. - */ - - switch (CPUCLOCK_WHICH(timer->it_clock)) { - case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; - break; - case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; - break; - } - } -} - -/* - * The timer is locked, fire it and arrange for its reload. - */ -static void cpu_timer_fire(struct k_itimer *timer) -{ - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - /* - * User don't want any signal. - */ - timer->it.cpu.expires.sched = 0; - } else if (unlikely(timer->sigq == NULL)) { - /* - * This a special case for clock_nanosleep, - * not a normal timer from sys_timer_create. - */ - wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { - /* - * One-shot timer. Clear it as soon as it's fired. - */ - posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; - } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { - /* - * The signal did not get queued because the signal - * was ignored, so we won't get any callback to - * reload the timer. But we need to keep it - * ticking in case the signal is deliverable next time. - */ - posix_cpu_timer_schedule(timer); - } -} - -/* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with tasklist_lock held for reading. 
- */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - struct task_cputime cputime; - - thread_group_cputimer(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; - break; - case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; - break; - case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); - break; - } - return 0; -} - -/* - * Guts of sys_timer_settime for CPU timers. - * This is called with the timer locked and interrupts disabled. - * If we return TIMER_RETRY, it's necessary to release the timer's lock - * and try again. (This happens when the timer is in the middle of firing.) - */ -static int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) -{ - struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; - int ret; - - if (unlikely(p == NULL)) { - /* - * Timer refers to a dead task's clock. - */ - return -ESRCH; - } - - new_expires = timespec_to_sample(timer->it_clock, &new->it_value); - - read_lock(&tasklist_lock); - /* - * We need the tasklist_lock to protect against reaping that - * clears p->sighand. If p has just been reaped, we can no - * longer get any information about it at all. - */ - if (unlikely(p->sighand == NULL)) { - read_unlock(&tasklist_lock); - put_task_struct(p); - timer->it.cpu.task = NULL; - return -ESRCH; - } - - /* - * Disarm any old timer after extracting its expiry time. 
- */ - BUG_ON(!irqs_disabled()); - - ret = 0; - old_incr = timer->it.cpu.incr; - spin_lock(&p->sighand->siglock); - old_expires = timer->it.cpu.expires; - if (unlikely(timer->it.cpu.firing)) { - timer->it.cpu.firing = -1; - ret = TIMER_RETRY; - } else - list_del_init(&timer->it.cpu.entry); - - /* - * We need to sample the current value to convert the new - * value from to relative and absolute, and to convert the - * old value from absolute to relative. To set a process - * timer, we need a sample to balance the thread expiry - * times (in arm_timer). With an absolute time, we must - * check if it's already passed. In short, we need a sample. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &val); - } else { - cpu_timer_sample_group(timer->it_clock, p, &val); - } - - if (old) { - if (old_expires.sched == 0) { - old->it_value.tv_sec = 0; - old->it_value.tv_nsec = 0; - } else { - /* - * Update the timer in case it has - * overrun already. If it has, - * we'll report it as having overrun - * and with the next reloaded timer - * already ticking, though we are - * swallowing that pending - * notification here to install the - * new setting. - */ - bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); - sample_to_timespec(timer->it_clock, - old_expires, - &old->it_value); - } else { - old->it_value.tv_nsec = 1; - old->it_value.tv_sec = 0; - } - } - } - - if (unlikely(ret)) { - /* - * We are colliding with the timer actually firing. - * Punt after filling in the timer's old value, and - * disable this firing since we are already reporting - * it as an overrun (thanks to bump_cpu_timer above). 
- */ - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); - goto out; - } - - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); - } - - /* - * Install the new expiry time (or zero). - * For a timer with no notification action, we don't actually - * arm the timer (we'll just fake it for timer_gettime). - */ - timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { - arm_timer(timer); - } - - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); - - /* - * Install the new reload setting, and - * set up the signal and overrun bookkeeping. - */ - timer->it.cpu.incr = timespec_to_sample(timer->it_clock, - &new->it_interval); - - /* - * This acts as a modification timestamp for the timer, - * so any automatic reload attempt will punt on seeing - * that we have reset the timer manually. - */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timer->it_overrun_last = 0; - timer->it_overrun = -1; - - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } - - ret = 0; - out: - if (old) { - sample_to_timespec(timer->it_clock, - old_incr, &old->it_interval); - } - return ret; -} - -static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) -{ - union cpu_time_count now; - struct task_struct *p = timer->it.cpu.task; - int clear_dead; - - /* - * Easy part: convert the reload time. - */ - sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &itp->it_interval); - - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. 
*/ - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - - if (unlikely(p == NULL)) { - /* - * This task already died and the timer will never fire. - * In this case, expires is actually the dead value. - */ - dead: - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); - return; - } - - /* - * Sample the clock to take the difference with the expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); - clear_dead = p->exit_state; - } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - * Call the timer disarmed, nothing else to do. - */ - put_task_struct(p); - timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - goto dead; - } else { - cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead = (unlikely(p->exit_state) && - thread_group_empty(p)); - } - read_unlock(&tasklist_lock); - } - - if (unlikely(clear_dead)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto dead; - } - - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { - sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), - &itp->it_value); - } else { - /* - * The timer should have expired already, but the firing - * hasn't taken place yet. Say it's just about to expire. - */ - itp->it_value.tv_nsec = 1; - itp->it_value.tv_sec = 0; - } -} - -/* - * Check for any per-thread CPU timers that have fired and move them off - * the tsk->cpu_timers[N] list onto the firing list. Here we update the - * tsk->it_*_expires values to reflect the remaining thread CPU timers. 
- */ -static void check_thread_timers(struct task_struct *tsk, - struct list_head *firing) -{ - int maxfire; - struct list_head *timers = tsk->cpu_timers; - struct signal_struct *const sig = tsk->signal; - unsigned long soft; - - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } - - /* - * Check for the special case thread timers. - */ - soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); - if (soft != RLIM_INFINITY) { - unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); - - if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { - /* - * At the soft limit, send a SIGXCPU every second. 
- */ - if (soft < hard) { - soft += USEC_PER_SEC; - sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; - } - printk(KERN_INFO - "RT Watchdog Timeout: %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -} - -static void stop_process_timers(struct signal_struct *sig) -{ - struct thread_group_cputimer *cputimer = &sig->cputimer; - unsigned long flags; - - raw_spin_lock_irqsave(&cputimer->lock, flags); - cputimer->running = 0; - raw_spin_unlock_irqrestore(&cputimer->lock, flags); -} - -static u32 onecputick; - -static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) -{ - if (!it->expires) - return; - - if (cur_time >= it->expires) { - if (it->incr) { - it->expires += it->incr; - it->error += it->incr_error; - if (it->error >= onecputick) { - it->expires -= cputime_one_jiffy; - it->error -= onecputick; - } - } else { - it->expires = 0; - } - - trace_itimer_expire(signo == SIGPROF ? - ITIMER_PROF : ITIMER_VIRTUAL, - tsk->signal->leader_pid, cur_time); - __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); - } - - if (it->expires && (!*expires || it->expires < *expires)) { - *expires = it->expires; - } -} - -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - -/* - * Check for any per-thread CPU timers that have fired and move them - * off the tsk->*_timers list onto the firing list. Per-thread timers - * have already been taken off. 
- */ -static void check_process_timers(struct task_struct *tsk, - struct list_head *firing) -{ - int maxfire; - struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; - struct list_head *timers = sig->cpu_timers; - struct task_cputime cputime; - unsigned long soft; - - /* - * Collect the current process totals. - */ - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; - sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - /* - * Check for the special case process timers. 
- */ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, - SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, - SIGVTALRM); - soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); - if (soft != RLIM_INFINITY) { - unsigned long psecs = cputime_to_secs(ptime); - unsigned long hard = - ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); - cputime_t x; - if (psecs >= hard) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } - if (psecs >= soft) { - /* - * At the soft limit, send a SIGXCPU every second. - */ - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (soft < hard) { - soft++; - sig->rlim[RLIMIT_CPU].rlim_cur = soft; - } - } - x = secs_to_cputime(soft); - if (!prof_expires || x < prof_expires) { - prof_expires = x; - } - } - - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; - sig->cputime_expires.sched_exp = sched_expires; - if (task_cputime_zero(&sig->cputime_expires)) - stop_process_timers(sig); -} - -/* - * This is called from the signal code (via do_schedule_next_timer) - * when the last timer signal was delivered and we have to reload the timer. - */ -void posix_cpu_timer_schedule(struct k_itimer *timer) -{ - struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; - - if (unlikely(p == NULL)) - /* - * The task was cleaned up already, no future firings. - */ - goto out; - - /* - * Fetch the current sample and update the timer's expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); - bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) { - clear_dead_task(timer, now); - goto out; - } - read_lock(&tasklist_lock); /* arm_timer needs it. */ - spin_lock(&p->sighand->siglock); - } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { - /* - * The process has been reaped. 
- * We can't even collect a sample any more. - */ - put_task_struct(p); - timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; - goto out_unlock; - } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto out_unlock; - } - spin_lock(&p->sighand->siglock); - cpu_timer_sample_group(timer->it_clock, p, &now); - bump_cpu_timer(timer, now); - /* Leave the tasklist_lock locked for the call below. */ - } - - /* - * Now re-arm for the new expiry time. - */ - BUG_ON(!irqs_disabled()); - arm_timer(timer); - spin_unlock(&p->sighand->siglock); - -out_unlock: - read_unlock(&tasklist_lock); - -out: - timer->it_overrun_last = timer->it_overrun; - timer->it_overrun = -1; - ++timer->it_requeue_pending; -} - -/** - * task_cputime_expired - Compare two task_cputime entities. - * - * @sample: The task_cputime structure to be checked for expiration. - * @expires: Expiration times, against which @sample will be checked. - * - * Checks @sample against @expires to see if any field of @sample has expired. - * Returns true if any field of the former is greater than the corresponding - * field of the latter if the latter field is set. Otherwise returns false. - */ -static inline int task_cputime_expired(const struct task_cputime *sample, - const struct task_cputime *expires) -{ - if (expires->utime && sample->utime >= expires->utime) - return 1; - if (expires->stime && sample->utime + sample->stime >= expires->stime) - return 1; - if (expires->sum_exec_runtime != 0 && - sample->sum_exec_runtime >= expires->sum_exec_runtime) - return 1; - return 0; -} - -/** - * fastpath_timer_check - POSIX CPU timers fast path. - * - * @tsk: The task (thread) being checked. - * - * Check the task and thread group timers. If both are zero (there are no - * timers set) return false. 
Otherwise snapshot the task and thread group - * timers and compare them with the corresponding expiration times. Return - * true if a timer has expired, else return false. - */ -static inline int fastpath_timer_check(struct task_struct *tsk) -{ - struct signal_struct *sig; - - if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; - - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) - return 1; - } - - sig = tsk->signal; - if (sig->cputimer.running) { - struct task_cputime group_sample; - - raw_spin_lock(&sig->cputimer.lock); - group_sample = sig->cputimer.cputime; - raw_spin_unlock(&sig->cputimer.lock); - - if (task_cputime_expired(&group_sample, &sig->cputime_expires)) - return 1; - } - - return 0; -} - -/* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. - */ -void run_posix_cpu_timers(struct task_struct *tsk) -{ - LIST_HEAD(firing); - struct k_itimer *timer, *next; - unsigned long flags; - - BUG_ON(!irqs_disabled()); - - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) - return; - - if (!lock_task_sighand(tsk, &flags)) - return; - /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. - */ - check_thread_timers(tsk, &firing); - /* - * If there are any active process wide timers (POSIX 1.b, itimers, - * RLIMIT_CPU) cputimer must be running. - */ - if (tsk->signal->cputimer.running) - check_process_timers(tsk, &firing); - - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. 
We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - unlock_task_sighand(tsk, &flags); - - /* - * Now that all the timers on our list have the firing flag, - * no one will touch their list entries but us. We'll take - * each timer's lock before clearing its firing flag, so no - * timer call will interfere. - */ - list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int cpu_firing; - - spin_lock(&timer->it_lock); - list_del_init(&timer->it.cpu.entry); - cpu_firing = timer->it.cpu.firing; - timer->it.cpu.firing = 0; - /* - * The firing flag is -1 if we collided with a reset - * of the timer, which already reported this - * almost-firing as an overrun. So don't generate an event. - */ - if (likely(cpu_firing >= 0)) - cpu_timer_fire(timer); - spin_unlock(&timer->it_lock); - } -} - -/* - * Set one of the process-wide special case CPU timers or RLIMIT_CPU. - * The tsk->sighand->siglock must be held by the caller. - */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, - cputime_t *newval, cputime_t *oldval) -{ - union cpu_time_count now; - - BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); - - if (oldval) { - /* - * We are setting itimer. The *oldval is absolute and we update - * it to be relative, *newval argument is relative and we update - * it to be absolute. - */ - if (*oldval) { - if (*oldval <= now.cpu) { - /* Just about to fire. */ - *oldval = cputime_one_jiffy; - } else { - *oldval -= now.cpu; - } - } - - if (!*newval) - return; - *newval += now.cpu; - } - - /* - * Update expiration cache if we are the earliest timer, or eventually - * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. 
- */ - switch (clock_idx) { - case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) - tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) - tsk->signal->cputime_expires.virt_exp = *newval; - break; - } -} - -static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) -{ - struct k_itimer timer; - int error; - - /* - * Set up a temporary timer and then wait for it to go off. - */ - memset(&timer, 0, sizeof timer); - spin_lock_init(&timer.it_lock); - timer.it_clock = which_clock; - timer.it_overrun = -1; - error = posix_cpu_timer_create(&timer); - timer.it_process = current; - if (!error) { - static struct itimerspec zero_it; - - memset(it, 0, sizeof *it); - it->it_value = *rqtp; - - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); - if (error) { - spin_unlock_irq(&timer.it_lock); - return error; - } - - while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { - /* - * Our timer fired and was reset. - */ - spin_unlock_irq(&timer.it_lock); - return 0; - } - - /* - * Block until cpu_timer_fire (or a signal) wakes us. - */ - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&timer.it_lock); - schedule(); - spin_lock_irq(&timer.it_lock); - } - - /* - * We were interrupted by a signal. - */ - sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, it); - spin_unlock_irq(&timer.it_lock); - - if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { - /* - * It actually did fire already. 
- */ - return 0; - } - - error = -ERESTART_RESTARTBLOCK; - } - - return error; -} - -static long posix_cpu_nsleep_restart(struct restart_block *restart_block); - -static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; - struct itimerspec it; - int error; - - /* - * Diagnose required errors first. - */ - if (CPUCLOCK_PERTHREAD(which_clock) && - (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) - return -EINVAL; - - error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - - if (flags & TIMER_ABSTIME) - return -ERESTARTNOHAND; - /* - * Report back to the user the time still remaining. - */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) - return -EFAULT; - - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->nanosleep.clockid = which_clock; - restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec_to_ns(rqtp); - } - return error; -} - -static long posix_cpu_nsleep_restart(struct restart_block *restart_block) -{ - clockid_t which_clock = restart_block->nanosleep.clockid; - struct timespec t; - struct itimerspec it; - int error; - - t = ns_to_timespec(restart_block->nanosleep.expires); - - error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - struct timespec __user *rmtp = restart_block->nanosleep.rmtp; - /* - * Report back to the user the time still remaining. 
- */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) - return -EFAULT; - - restart_block->nanosleep.expires = timespec_to_ns(&t); - } - return error; - -} - -#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) - -static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_getres(PROCESS_CLOCK, tp); -} -static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_get(PROCESS_CLOCK, tp); -} -static int process_cpu_timer_create(struct k_itimer *timer) -{ - timer->it_clock = PROCESS_CLOCK; - return posix_cpu_timer_create(timer); -} -static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, - struct timespec __user *rmtp) -{ - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); -} -static long process_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} -static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_getres(THREAD_CLOCK, tp); -} -static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) -{ - return posix_cpu_clock_get(THREAD_CLOCK, tp); -} -static int thread_cpu_timer_create(struct k_itimer *timer) -{ - timer->it_clock = THREAD_CLOCK; - return posix_cpu_timer_create(timer); -} - -struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .nsleep_restart = posix_cpu_nsleep_restart, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, -}; - -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - 
.timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, - }; - struct timespec ts; - - posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - - cputime_to_timespec(cputime_one_jiffy, &ts); - onecputick = ts.tv_nsec; - WARN_ON(ts.tv_sec != 0); - - return 0; -} -__initcall(init_posix_cpu_timers); -/* - * linux/kernel/posix-timers.c - * - * - * 2002-10-15 Posix Clocks & timers - * by George Anzinger george@mvista.com - * - * Copyright (C) 2002 2003 by MontaVista Software. - * - * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. - * Copyright (C) 2004 Boris Hu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA - */ - -/* These are all the functions necessary to implement - * POSIX clocks & timers - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Management arrays for POSIX timers. 
Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to - * void idr_remove(struct idr *idp, int id); to release - * void idr_init(struct idr *idp); to initialize - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. - */ - -/* - * Lets keep our timers in a slab cache :-) - */ -static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); - -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ -#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) -#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" -#endif - -/* - * parisc wants ENOTSUP instead of EOPNOTSUPP - */ -#ifndef ENOTSUP -# define ENANOSLEEP_NOTSUP EOPNOTSUPP -#else -# define ENANOSLEEP_NOTSUP ENOTSUP -#endif - -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. - */ - -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks. 
- * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * - * FUNCTIONS: The CLOCKs structure defines possible functions to - * handle various clock functions. - * - * The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for - * the timer. 2.) The list, it_lock, it_clock, it_id and - * it_pid fields are not modified by timer code. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). - */ - -static struct k_clock posix_clocks[MAX_CLOCKS]; - -/* - * These ones are defined below. 
- */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, - struct timespec __user *rmtp); -static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); -static int common_timer_del(struct k_itimer *timer); - -static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); - -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); - -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ -}) - -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) -{ - spin_unlock_irqrestore(&timr->it_lock, flags); -} - -/* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_real_ts(tp); - return 0; -} - -/* Set clock_realtime */ -static int posix_clock_realtime_set(const clockid_t which_clock, - const struct timespec *tp) -{ - return do_sys_settimeofday(tp, NULL); -} - -static int posix_clock_realtime_adj(const clockid_t which_clock, - struct timex *t) -{ - return do_adjtimex(t); -} - -/* - * Get monotonic time for posix timers - */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_ts(tp); - return 0; -} - -/* - * Get monotonic-raw time for posix timers - */ -static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) -{ - getrawmonotonic(tp); - return 0; -} - - -static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) -{ - *tp = current_kernel_time(); - return 0; -} - -static int posix_get_monotonic_coarse(clockid_t which_clock, - struct timespec *tp) -{ - *tp = get_monotonic_coarse(); - return 0; -} - -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) -{ - *tp = 
ktime_to_timespec(KTIME_LOW_RES); - return 0; -} - -static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) -{ - get_monotonic_boottime(tp); - return 0; -} - - -/* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ -static __init int init_posix_timers(void) -{ - struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - }; - struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - }; - struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - }; - struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - - posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); - posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); - posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - 
posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); - posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); - - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); - idr_init(&posix_timers_id); - return 0; -} - -__initcall(init_posix_timers); - -static void schedule_next_timer(struct k_itimer *timr) -{ - struct hrtimer *timer = &timr->it.real.timer; - - if (timr->it.real.interval.tv64 == 0) - return; - - timr->it_overrun += (unsigned int) hrtimer_forward(timer, - timer->base->get_time(), - timr->it.real.interval); - - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; - ++timr->it_requeue_pending; - hrtimer_restart(timer); -} - -/* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect against the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. - */ -void do_schedule_next_timer(struct siginfo *info) -{ - struct k_itimer *timr; - unsigned long flags; - - timr = lock_timer(info->si_tid, &flags); - - if (timr && timr->it_requeue_pending == info->si_sys_private) { - if (timr->it_clock < 0) - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); - - info->si_overrun += timr->it_overrun_last; - } - - if (timr) - unlock_timer(timr, flags); -} - -int posix_timer_event(struct k_itimer *timr, int si_private) -{ - struct task_struct *task; - int shared, ret = -1; - /* - * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->do_schedule_next_timer(). 
- * - * If dequeue_signal() sees the "right" value of - * si_sys_private it calls do_schedule_next_timer(). - * We re-queue ->sigq and drop ->it_lock(). - * do_schedule_next_timer() locks the timer - * and re-schedules it while ->sigq is pending. - * Not really bad, but not that we want. - */ - timr->sigq->info.si_sys_private = si_private; - - rcu_read_lock(); - task = pid_task(timr->it_pid, PIDTYPE_PID); - if (task) { - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, task, shared); - } - rcu_read_unlock(); - /* If we failed to send the signal the timer stops. */ - return ret > 0; -} -EXPORT_SYMBOL_GPL(posix_timer_event); - -/* - * This function gets called when a POSIX.1b interval timer expires. It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. - */ -static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) -{ - struct k_itimer *timr; - unsigned long flags; - int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; - - timr = container_of(timer, struct k_itimer, it.real.timer); - spin_lock_irqsave(&timr->it_lock, flags); - - if (timr->it.real.interval.tv64 != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - if (timr->it.real.interval.tv64 != 0) { - ktime_t now = hrtimer_cb_get_time(timer); - - /* - * FIXME: What we really want, is to stop this - * timer completely and restart it in case the - * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. 
- * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. - */ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); - - if (timr->it.real.interval.tv64 < kj.tv64) - now = ktime_add(now, kj); - } -#endif - timr->it_overrun += (unsigned int) - hrtimer_forward(timer, now, - timr->it.real.interval); - ret = HRTIMER_RESTART; - ++timr->it_requeue_pending; - } - } - - unlock_timer(timr, flags); - return ret; -} - -static struct pid *good_sigevent(sigevent_t * event) -{ - struct task_struct *rtn = current->group_leader; - - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) - return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); -} - -void posix_timers_register_clock(const clockid_t clock_id, - struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", - clock_id); - return; - } - if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", - clock_id); - return; - } - - 
posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(posix_timers_register_clock); - -static struct k_itimer * alloc_posix_timer(void) -{ - struct k_itimer *tmr; - tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); - if (!tmr) - return tmr; - if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { - kmem_cache_free(posix_timers_cache, tmr); - return NULL; - } - memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); - return tmr; -} - -static void k_itimer_rcu_free(struct rcu_head *head) -{ - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); - - kmem_cache_free(posix_timers_cache, tmr); -} - -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) -{ - if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); - } - put_pid(tmr->it_pid); - sigqueue_free(tmr->sigq); - call_rcu(&tmr->it.rcu, k_itimer_rcu_free); -} - -static struct k_clock *clockid_to_kclock(const clockid_t id) -{ - if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? - &clock_posix_dynamic : &clock_posix_cpu; - - if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) - return NULL; - return &posix_clocks[id]; -} - -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -/* Create a POSIX.1b interval timer. 
*/ - -SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, - struct sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct k_itimer *new_timer; - int error, new_timer_id; - sigevent_t event; - int it_id_set = IT_ID_NOT_SET; - - if (!kc) - return -EINVAL; - if (!kc->timer_create) - return -EOPNOTSUPP; - - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - retry: - if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { - error = -EAGAIN; - goto out; - } - spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); - spin_unlock_irq(&idr_lock); - if (error) { - if (error == -EAGAIN) - goto retry; - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - error = -EAGAIN; - goto out; - } - - it_id_set = IT_ID_SET; - new_timer->it_id = (timer_t) new_timer_id; - new_timer->it_clock = which_clock; - new_timer->it_overrun = -1; - - if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { - error = -EFAULT; - goto out; - } - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(&event)); - rcu_read_unlock(); - if (!new_timer->it_pid) { - error = -EINVAL; - goto out; - } - } else { - event.sigev_notify = SIGEV_SIGNAL; - event.sigev_signo = SIGALRM; - event.sigev_value.sival_int = new_timer->it_id; - new_timer->it_pid = get_pid(task_tgid(current)); - } - - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->sigq->info.si_signo = event.sigev_signo; - new_timer->sigq->info.si_value = event.sigev_value; - new_timer->sigq->info.si_tid = new_timer->it_id; - new_timer->sigq->info.si_code = SI_TIMER; - - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } - - error = kc->timer_create(new_timer); - if (error) - goto out; - 
- spin_lock_irq(¤t->sighand->siglock); - new_timer->it_signal = current->signal; - list_add(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); - - return 0; - /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. - */ -out: - release_posix_timer(new_timer, it_id_set); - return error; -} - -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) -{ - struct k_itimer *timr; - - rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); - if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); - if (timr->it_signal == current->signal) { - rcu_read_unlock(); - return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); - } - rcu_read_unlock(); - - return NULL; -} - -/* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. - * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending. These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. - * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here. Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. 
- */ -static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) -{ - ktime_t now, remaining, iv; - struct hrtimer *timer = &timr->it.real.timer; - - memset(cur_setting, 0, sizeof(struct itimerspec)); - - iv = timr->it.real.interval; - - /* interval timer ? */ - if (iv.tv64) - cur_setting->it_interval = ktime_to_timespec(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - return; - - now = timer->base->get_time(); - - /* - * When a requeue is pending or this is a SIGEV_NONE - * timer move the expiry time forward by intervals, so - * expiry is > now. - */ - if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - - remaining = ktime_sub(hrtimer_get_expires(timer), now); - /* Return 0 only, when the timer is expired and not pending */ - if (remaining.tv64 <= 0) { - /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! - */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - cur_setting->it_value.tv_nsec = 1; - } else - cur_setting->it_value = ktime_to_timespec(remaining); -} - -/* Get the time remaining on a POSIX.1b interval timer. */ -SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) -{ - struct itimerspec cur_setting; - struct k_itimer *timr; - struct k_clock *kc; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - kc = clockid_to_kclock(timr->it_clock); - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, &cur_setting); - - unlock_timer(timr, flags); - - if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) - return -EFAULT; - - return ret; -} - -/* - * Get the number of overruns of a POSIX.1b interval timer. 
This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to do_schedule_next_timer(). So all we need to do is - * to pick up the frozen overrun. - */ -SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) -{ - struct k_itimer *timr; - int overrun; - unsigned long flags; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timr->it_overrun_last; - unlock_timer(timr, flags); - - return overrun; -} - -/* Set a POSIX.1b interval timer. */ -/* timr->it_lock is taken. */ -static int -common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) -{ - struct hrtimer *timer = &timr->it.real.timer; - enum hrtimer_mode mode; - - if (old_setting) - common_timer_get(timr, old_setting); - - /* disable the timer */ - timr->it.real.interval.tv64 = 0; - /* - * careful here. If smp we could be in the "fire" routine which will - * be spinning as we hold the lock. But this is ONLY an SMP issue. - */ - if (hrtimer_try_to_cancel(timer) < 0) - return TIMER_RETRY; - - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; - - /* switch off the timer when it_value is zero */ - if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) - return 0; - - mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; - - hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); - - /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); - - /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { - /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { - hrtimer_add_expires(timer, timer->base->get_time()); - } - return 0; - } - - hrtimer_start_expires(timer, mode); - return 0; -} - -/* Set a POSIX.1b interval timer */ -SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) -{ - struct k_itimer *timr; - struct itimerspec new_spec, old_spec; - int error = 0; - unsigned long flag; - struct itimerspec *rtn = old_setting ? &old_spec : NULL; - struct k_clock *kc; - - if (!new_setting) - return -EINVAL; - - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) - return -EFAULT; - - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) - return -EINVAL; -retry: - timr = lock_timer(timer_id, &flag); - if (!timr) - return -EINVAL; - - kc = clockid_to_kclock(timr->it_clock); - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, flags, &new_spec, rtn); - - unlock_timer(timr, flag); - if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... - goto retry; - } - - if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) - error = -EFAULT; - - return error; -} - -static int common_timer_del(struct k_itimer *timer) -{ - timer->it.real.interval.tv64 = 0; - - if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) - return TIMER_RETRY; - return 0; -} - -static inline int timer_delete_hook(struct k_itimer *timer) -{ - struct k_clock *kc = clockid_to_kclock(timer->it_clock); - - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); -} - -/* Delete a POSIX.1b interval timer. 
*/ -SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) -{ - struct k_itimer *timer; - unsigned long flags; - -retry_delete: - timer = lock_timer(timer_id, &flags); - if (!timer) - return -EINVAL; - - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); - goto retry_delete; - } - - spin_lock(¤t->sighand->siglock); - list_del(&timer->list); - spin_unlock(¤t->sighand->siglock); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); - return 0; -} - -/* - * return timer owned by the process, used by exit_itimers - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - -retry_delete: - spin_lock_irqsave(&timer->it_lock, flags); - - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); - goto retry_delete; - } - list_del(&timer->list); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - - unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); -} - -/* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. 
- */ -void exit_itimers(struct signal_struct *sig) -{ - struct k_itimer *tmr; - - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); - itimer_delete(tmr); - } -} - -SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec new_tp; - - if (!kc || !kc->clock_set) - return -EINVAL; - - if (copy_from_user(&new_tp, tp, sizeof (*tp))) - return -EFAULT; - - return kc->clock_set(which_clock, &new_tp); -} - -SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec kernel_tp; - int error; - - if (!kc) - return -EINVAL; - - error = kc->clock_get(which_clock, &kernel_tp); - - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) - error = -EFAULT; - - return error; -} - -SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, - struct timex __user *, utx) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; - int err; - - if (!kc) - return -EINVAL; - if (!kc->clock_adj) - return -EOPNOTSUPP; - - if (copy_from_user(&ktx, utx, sizeof(ktx))) - return -EFAULT; - - err = kc->clock_adj(which_clock, &ktx); - - if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) - return -EFAULT; - - return err; -} - -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec rtn_tp; - int error; - - if (!kc) - return -EINVAL; - - error = kc->clock_getres(which_clock, &rtn_tp); - - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) - error = -EFAULT; - - return error; -} - -/* - * nanosleep for monotonic and realtime clocks - */ -static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return 
hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); -} - -SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) -{ - struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec t; - - if (!kc) - return -EINVAL; - if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; - - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) - return -EFAULT; - - if (!timespec_valid(&t)) - return -EINVAL; - - return kc->nsleep(which_clock, flags, &t, rmtp); -} - -/* - * This will restart clock_nanosleep. This is required only by - * compat_clock_nanosleep_restart for now. - */ -long clock_nanosleep_restart(struct restart_block *restart_block) -{ - clockid_t which_clock = restart_block->nanosleep.clockid; - struct k_clock *kc = clockid_to_kclock(which_clock); - - if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) - return -EINVAL; - - return kc->nsleep_restart(restart_block); -} -/* - * This file provides functions for block I/O operations on swap/file. - * - * Copyright (C) 1998,2001-2005 Pavel Machek - * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include - -#include "power.h" - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. 
- */ -static int submit(int rw, struct block_device *bdev, sector_t sector, - struct page *page, struct bio **bio_chain) -{ - const int bio_rw = rw | REQ_SYNC; - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)sector); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} -/* - * Functions for saving/restoring console. - * - * Originally from swsusp. 
- */ - -#include -#include -#include -#include -#include "power.h" - -#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) - -static int orig_fgconsole, orig_kmsg; - -int pm_prepare_console(void) -{ - orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); - if (orig_fgconsole < 0) - return 1; - - orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); - return 0; -} - -void pm_restore_console(void) -{ - if (orig_fgconsole >= 0) { - vt_move_to_console(orig_fgconsole, 0); - vt_kmsg_redirect(orig_kmsg); - } -} -/* - * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek - * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - - -static int nocompress; -static int noresume; -static int resume_wait; -static int resume_delay; -static char resume_file[256] = CONFIG_PM_STD_PARTITION; -dev_t swsusp_resume_device; -sector_t swsusp_resume_block; -int in_suspend __nosavedata; - -enum { - HIBERNATION_INVALID, - HIBERNATION_PLATFORM, - HIBERNATION_SHUTDOWN, - HIBERNATION_REBOOT, - /* keep last */ - __HIBERNATION_AFTER_LAST -}; -#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) -#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) - -static int hibernation_mode = HIBERNATION_SHUTDOWN; - -bool freezer_test_done; - -static const struct platform_hibernation_ops *hibernation_ops; - -/** - * hibernation_set_ops - Set the global hibernate operations. - * @ops: Hibernation operations to use in subsequent hibernation transitions. 
- */ -void hibernation_set_ops(const struct platform_hibernation_ops *ops) -{ - if (ops && !(ops->begin && ops->end && ops->pre_snapshot - && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup && ops->leave)) { - WARN_ON(1); - return; - } - lock_system_sleep(); - hibernation_ops = ops; - if (ops) - hibernation_mode = HIBERNATION_PLATFORM; - else if (hibernation_mode == HIBERNATION_PLATFORM) - hibernation_mode = HIBERNATION_SHUTDOWN; - - unlock_system_sleep(); -} - -static bool entering_platform_hibernation; - -bool system_entering_hibernation(void) -{ - return entering_platform_hibernation; -} -EXPORT_SYMBOL(system_entering_hibernation); - -#ifdef CONFIG_PM_DEBUG -static void hibernation_debug_sleep(void) -{ - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); - mdelay(5000); -} - -static int hibernation_test(int level) -{ - if (pm_test_level == level) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} -#else /* !CONFIG_PM_DEBUG */ -static int hibernation_test(int level) { return 0; } -#endif /* !CONFIG_PM_DEBUG */ - -/** - * platform_begin - Call platform to start hibernation. - * @platform_mode: Whether or not to use the platform driver. - */ -static int platform_begin(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; -} - -/** - * platform_end - Call platform to finish transition to the working state. - * @platform_mode: Whether or not to use the platform driver. - */ -static void platform_end(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->end(); -} - -/** - * platform_pre_snapshot - Call platform to prepare the machine for hibernation. - * @platform_mode: Whether or not to use the platform driver. - * - * Use the platform driver to prepare the system for creating a hibernate image, - * if so configured, and return an error code if that fails. 
- */ - -static int platform_pre_snapshot(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_snapshot() : 0; -} - -/** - * platform_leave - Call platform to prepare a transition to the working state. - * @platform_mode: Whether or not to use the platform driver. - * - * Use the platform driver prepare to prepare the machine for switching to the - * normal mode of operation. - * - * This routine is called on one CPU with interrupts disabled. - */ -static void platform_leave(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->leave(); -} - -/** - * platform_finish - Call platform to switch the system to the working state. - * @platform_mode: Whether or not to use the platform driver. - * - * Use the platform driver to switch the machine to the normal mode of - * operation. - * - * This routine must be called after platform_prepare(). - */ -static void platform_finish(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->finish(); -} - -/** - * platform_pre_restore - Prepare for hibernate image restoration. - * @platform_mode: Whether or not to use the platform driver. - * - * Use the platform driver to prepare the system for resume from a hibernation - * image. - * - * If the restore fails after this function has been called, - * platform_restore_cleanup() must be called. - */ -static int platform_pre_restore(int platform_mode) -{ - return (platform_mode && hibernation_ops) ? - hibernation_ops->pre_restore() : 0; -} - -/** - * platform_restore_cleanup - Switch to the working state after failing restore. - * @platform_mode: Whether or not to use the platform driver. - * - * Use the platform driver to switch the system to the normal mode of operation - * after a failing restore. - * - * If platform_pre_restore() has been called before the failing restore, this - * function must be called too, regardless of the result of - * platform_pre_restore(). 
- */ -static void platform_restore_cleanup(int platform_mode) -{ - if (platform_mode && hibernation_ops) - hibernation_ops->restore_cleanup(); -} - -/** - * platform_recover - Recover from a failure to suspend devices. - * @platform_mode: Whether or not to use the platform driver. - */ -static void platform_recover(int platform_mode) -{ - if (platform_mode && hibernation_ops && hibernation_ops->recover) - hibernation_ops->recover(); -} - -/** - * swsusp_show_speed - Print time elapsed between two events during hibernation. - * @start: Starting event. - * @stop: Final event. - * @nr_pages: Number of memory pages processed between @start and @stop. - * @msg: Additional diagnostic message to print. - */ -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} - -/** - * create_image - Create a hibernation image. - * @platform_mode: Whether or not to use the platform driver. - * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. - * - * Control reappears in this routine after the subsequent restore. 
- */ -static int create_image(int platform_mode) -{ - int error; - - error = dpm_suspend_noirq(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); - return error; - } - - error = platform_pre_snapshot(platform_mode); - if (error || hibernation_test(TEST_PLATFORM)) - goto Platform_finish; - - error = disable_nonboot_cpus(); - if (error || hibernation_test(TEST_CPUS)) - goto Enable_cpus; - - local_irq_disable(); - - error = syscore_suspend(); - if (error) { - printk(KERN_ERR "PM: Some system devices failed to power down, " - "aborting hibernation\n"); - goto Enable_irqs; - } - - if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) - goto Power_up; - - in_suspend = 1; - save_processor_state(); - error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); - /* Restore control flow magically appears here */ - restore_processor_state(); - if (!in_suspend) { - events_check_enabled = false; - platform_leave(platform_mode); - } - - Power_up: - syscore_resume(); - - Enable_irqs: - local_irq_enable(); - - Enable_cpus: - enable_nonboot_cpus(); - - Platform_finish: - platform_finish(platform_mode); - - dpm_resume_noirq(in_suspend ? - (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - - return error; -} - -/** - * hibernation_snapshot - Quiesce devices and create a hibernation image. - * @platform_mode: If set, use platform driver to prepare for the transition. - * - * This routine must be called with pm_mutex held. - */ -int hibernation_snapshot(int platform_mode) -{ - pm_message_t msg; - int error; - - error = platform_begin(platform_mode); - if (error) - goto Close; - - /* Preallocate image memory before shutting down devices. 
*/ - error = hibernate_preallocate_memory(); - if (error) - goto Close; - - error = freeze_kernel_threads(); - if (error) - goto Cleanup; - - if (hibernation_test(TEST_FREEZER)) { - - /* - * Indicate to the caller that we are returning due to a - * successful freezer test. - */ - freezer_test_done = true; - goto Cleanup; - } - - error = dpm_prepare(PMSG_FREEZE); - if (error) { - dpm_complete(PMSG_RECOVER); - goto Cleanup; - } - - suspend_console(); - pm_restrict_gfp_mask(); - - error = dpm_suspend(PMSG_FREEZE); - - if (error || hibernation_test(TEST_DEVICES)) - platform_recover(platform_mode); - else - error = create_image(platform_mode); - - /* - * In the case that we call create_image() above, the control - * returns here (1) after the image has been created or the - * image creation has failed and (2) after a successful restore. - */ - - /* We may need to release the preallocated image pages here. */ - if (error || !in_suspend) - swsusp_free(); - - msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; - dpm_resume(msg); - - if (error || !in_suspend) - pm_restore_gfp_mask(); - - resume_console(); - dpm_complete(msg); - - Close: - platform_end(platform_mode); - return error; - - Cleanup: - swsusp_free(); - goto Close; -} - -/** - * resume_target_kernel - Restore system state from a hibernation image. - * @platform_mode: Whether or not to use the platform driver. - * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. 
- */ -static int resume_target_kernel(bool platform_mode) -{ - int error; - - error = dpm_suspend_noirq(PMSG_QUIESCE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); - return error; - } - - error = platform_pre_restore(platform_mode); - if (error) - goto Cleanup; - - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - - local_irq_disable(); - - error = syscore_suspend(); - if (error) - goto Enable_irqs; - - save_processor_state(); - error = restore_highmem(); - if (!error) { - error = swsusp_arch_resume(); - /* - * The code below is only ever reached in case of a failure. - * Otherwise, execution continues at the place where - * swsusp_arch_suspend() was called. - */ - BUG_ON(!error); - /* - * This call to restore_highmem() reverts the changes made by - * the previous one. - */ - restore_highmem(); - } - /* - * The only reason why swsusp_arch_resume() can fail is memory being - * very tight, so we have to free it as soon as we can to avoid - * subsequent failures. - */ - swsusp_free(); - restore_processor_state(); - touch_softlockup_watchdog(); - - syscore_resume(); - - Enable_irqs: - local_irq_enable(); - - Enable_cpus: - enable_nonboot_cpus(); - - Cleanup: - platform_restore_cleanup(platform_mode); - - dpm_resume_noirq(PMSG_RECOVER); - - return error; -} - -/** - * hibernation_restore - Quiesce devices and restore from a hibernation image. - * @platform_mode: If set, use platform driver to prepare for the transition. - * - * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snapshot(). 
- */ -int hibernation_restore(int platform_mode) -{ - int error; - - pm_prepare_console(); - suspend_console(); - pm_restrict_gfp_mask(); - error = dpm_suspend_start(PMSG_QUIESCE); - if (!error) { - error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); - } - pm_restore_gfp_mask(); - resume_console(); - pm_restore_console(); - return error; -} - -/** - * hibernation_platform_enter - Power off the system using the platform driver. - */ -int hibernation_platform_enter(void) -{ - int error; - - if (!hibernation_ops) - return -ENOSYS; - - /* - * We have cancelled the power transition by running - * hibernation_ops->finish() before saving the image, so we should let - * the firmware know that we're going to enter the sleep state after all - */ - error = hibernation_ops->begin(); - if (error) - goto Close; - - entering_platform_hibernation = true; - suspend_console(); - error = dpm_suspend_start(PMSG_HIBERNATE); - if (error) { - if (hibernation_ops->recover) - hibernation_ops->recover(); - goto Resume_devices; - } - - error = dpm_suspend_noirq(PMSG_HIBERNATE); - if (error) - goto Resume_devices; - - error = hibernation_ops->prepare(); - if (error) - goto Platform_finish; - - error = disable_nonboot_cpus(); - if (error) - goto Platform_finish; - - local_irq_disable(); - syscore_suspend(); - if (pm_wakeup_pending()) { - error = -EAGAIN; - goto Power_up; - } - - hibernation_ops->enter(); - /* We should never get here */ - while (1); - - Power_up: - syscore_resume(); - local_irq_enable(); - enable_nonboot_cpus(); - - Platform_finish: - hibernation_ops->finish(); - - dpm_resume_noirq(PMSG_RESTORE); - - Resume_devices: - entering_platform_hibernation = false; - dpm_resume_end(PMSG_RESTORE); - resume_console(); - - Close: - hibernation_ops->end(); - - return error; -} - -/** - * power_down - Shut the machine down for hibernation. 
- * - * Use the platform driver, if configured, to put the system into the sleep - * state corresponding to hibernation, or try to power it off or reboot, - * depending on the value of hibernation_mode. - */ -static void power_down(void) -{ - switch (hibernation_mode) { - case HIBERNATION_REBOOT: - kernel_restart(NULL); - break; - case HIBERNATION_PLATFORM: - hibernation_platform_enter(); - case HIBERNATION_SHUTDOWN: - kernel_power_off(); - break; - } - kernel_halt(); - /* - * Valid image is on the disk, if we continue we risk serious data - * corruption after resume. - */ - printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); -} - -/** - * hibernate - Carry out system hibernation, including saving the image. - */ -int hibernate(void) -{ - int error; - - lock_system_sleep(); - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - goto Exit; - - error = usermodehelper_disable(); - if (error) - goto Exit; - - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - - printk(KERN_INFO "PM: Syncing filesystems ... 
"); - sys_sync(); - printk("done.\n"); - - error = freeze_processes(); - if (error) - goto Finish; - - error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (error) - goto Thaw; - if (freezer_test_done) { - freezer_test_done = false; - goto Thaw; - } - - if (in_suspend) { - unsigned int flags = 0; - - if (hibernation_mode == HIBERNATION_PLATFORM) - flags |= SF_PLATFORM_MODE; - if (nocompress) - flags |= SF_NOCOMPRESS_MODE; - else - flags |= SF_CRC32_MODE; - - pr_debug("PM: writing image.\n"); - error = swsusp_write(flags); - swsusp_free(); - if (!error) - power_down(); - in_suspend = 0; - pm_restore_gfp_mask(); - } else { - pr_debug("PM: Image restored successfully.\n"); - } - - Thaw: - thaw_processes(); - Finish: - free_basic_memory_bitmaps(); - usermodehelper_enable(); - Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - Unlock: - unlock_system_sleep(); - return error; -} - - -/** - * software_resume - Resume from a saved hibernation image. - * - * This routine is called as a late initcall, when all devices have been - * discovered and initialized already. - * - * The image reading code is called to see if there is a hibernation image - * available for reading. If that is the case, devices are quiesced and the - * contents of memory is restored from the saved image. - * - * If this is successful, control reappears in the restored target kernel in - * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine - * attempts to recover gracefully and make the kernel return to the normal mode - * of operation. - */ -static int software_resume(void) -{ - int error; - unsigned int flags; - - /* - * If the user said "noresume".. bail out early. - */ - if (noresume) - return 0; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. 
Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); - - if (swsusp_resume_device) - goto Check_image; - - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } - - pr_debug("PM: Checking hibernation image partition %s\n", resume_file); - - if (resume_delay) { - printk(KERN_INFO "Waiting %dsec before reading resume device...\n", - resume_delay); - ssleep(resume_delay); - } - - /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - - if (resume_wait) { - while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) - msleep(10); - async_synchronize_full(); - } - - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. 
- */ - scsi_complete_async_scans(); - - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; - goto Unlock; - } - } - - Check_image: - pr_debug("PM: Hibernation image partition %d:%d present\n", - MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - - pr_debug("PM: Looking for hibernation image.\n"); - error = swsusp_check(); - if (error) - goto Unlock; - - /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - swsusp_close(FMODE_READ); - goto Unlock; - } - - pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - goto close_finish; - - error = usermodehelper_disable(); - if (error) - goto close_finish; - - error = create_basic_memory_bitmaps(); - if (error) { - usermodehelper_enable(); - goto close_finish; - } - - pr_debug("PM: Preparing processes for restore.\n"); - error = freeze_processes(); - if (error) { - swsusp_close(FMODE_READ); - goto Done; - } - - pr_debug("PM: Loading hibernation image.\n"); - - error = swsusp_read(&flags); - swsusp_close(FMODE_READ); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); - swsusp_free(); - thaw_processes(); - Done: - free_basic_memory_bitmaps(); - usermodehelper_enable(); - Finish: - pm_notifier_call_chain(PM_POST_RESTORE); - pm_restore_console(); - atomic_inc(&snapshot_device_available); - /* For success case, the suspend path will release the lock */ - Unlock: - mutex_unlock(&pm_mutex); - pr_debug("PM: Hibernation image not present or could not be loaded.\n"); - return error; -close_finish: - swsusp_close(FMODE_READ); - goto Finish; -} - -late_initcall(software_resume); - - -static const char * const hibernation_modes[] = { - [HIBERNATION_PLATFORM] = "platform", - [HIBERNATION_SHUTDOWN] = "shutdown", - [HIBERNATION_REBOOT] = "reboot", -}; - -/* - * 
/sys/power/disk - Control hibernation mode. - * - * Hibernation can be handled in several ways. There are a few different ways - * to put the system into the sleep state: using the platform driver (e.g. ACPI - * or other hibernation_ops), powering it off or rebooting it (for testing - * mostly). - * - * The sysfs file /sys/power/disk provides an interface for selecting the - * hibernation mode to use. Reading from this file causes the available modes - * to be printed. There are 3 modes that can be supported: - * - * 'platform' - * 'shutdown' - * 'reboot' - * - * If a platform hibernation driver is in use, 'platform' will be supported - * and will be used by default. Otherwise, 'shutdown' will be used by default. - * The selected option (i.e. the one corresponding to the current value of - * hibernation_mode) is enclosed by a square bracket. - * - * To select a given hibernation mode it is necessary to write the mode's - * string representation (as returned by reading from /sys/power/disk) back - * into /sys/power/disk. - */ - -static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - int i; - char *start = buf; - - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (!hibernation_modes[i]) - continue; - switch (i) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - break; - /* not a valid mode, continue with loop */ - continue; - } - if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); - else - buf += sprintf(buf, "%s ", hibernation_modes[i]); - } - buf += sprintf(buf, "\n"); - return buf-start; -} - -static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - int error = 0; - int i; - int len; - char *p; - int mode = HIBERNATION_INVALID; - - p = memchr(buf, '\n', n); - len = p ? 
p - buf : n; - - lock_system_sleep(); - for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (len == strlen(hibernation_modes[i]) - && !strncmp(buf, hibernation_modes[i], len)) { - mode = i; - break; - } - } - if (mode != HIBERNATION_INVALID) { - switch (mode) { - case HIBERNATION_SHUTDOWN: - case HIBERNATION_REBOOT: - hibernation_mode = mode; - break; - case HIBERNATION_PLATFORM: - if (hibernation_ops) - hibernation_mode = mode; - else - error = -EINVAL; - } - } else - error = -EINVAL; - - if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", - hibernation_modes[mode]); - unlock_system_sleep(); - return error ? error : n; -} - -power_attr(disk); - -static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); -} - -static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned int maj, min; - dev_t res; - int ret = -EINVAL; - - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; - - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; - - lock_system_sleep(); - swsusp_resume_device = res; - unlock_system_sleep(); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); - noresume = 0; - software_resume(); - ret = n; - out: - return ret; -} - -power_attr(resume); - -static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", image_size); -} - -static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long size; - - if (sscanf(buf, "%lu", &size) == 1) { - image_size = size; - return n; - } - - return -EINVAL; -} - -power_attr(image_size); - -static ssize_t reserved_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", reserved_size); -} - -static 
ssize_t reserved_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long size; - - if (sscanf(buf, "%lu", &size) == 1) { - reserved_size = size; - return n; - } - - return -EINVAL; -} - -power_attr(reserved_size); - -static struct attribute * g[] = { - &disk_attr.attr, - &resume_attr.attr, - &image_size_attr.attr, - &reserved_size_attr.attr, - NULL, -}; - - -static struct attribute_group attr_group = { - .attrs = g, -}; - - -static int __init pm_disk_init(void) -{ - return sysfs_create_group(power_kobj, &attr_group); -} - -core_initcall(pm_disk_init); - - -static int __init resume_setup(char *str) -{ - if (noresume) - return 1; - - strncpy( resume_file, str, 255 ); - return 1; -} - -static int __init resume_offset_setup(char *str) -{ - unsigned long long offset; - - if (noresume) - return 1; - - if (sscanf(str, "%llu", &offset) == 1) - swsusp_resume_block = offset; - - return 1; -} - -static int __init hibernate_setup(char *str) -{ - if (!strncmp(str, "noresume", 8)) - noresume = 1; - else if (!strncmp(str, "nocompress", 10)) - nocompress = 1; - return 1; -} - -static int __init noresume_setup(char *str) -{ - noresume = 1; - return 1; -} - -static int __init resumewait_setup(char *str) -{ - resume_wait = 1; - return 1; -} - -static int __init resumedelay_setup(char *str) -{ - resume_delay = simple_strtoul(str, NULL, 0); - return 1; -} - -__setup("noresume", noresume_setup); -__setup("resume_offset=", resume_offset_setup); -__setup("resume=", resume_setup); -__setup("hibernate=", hibernate_setup); -__setup("resumewait", resumewait_setup); -__setup("resumedelay=", resumedelay_setup); -/* - * kernel/power/main.c - PM subsystem core functionality. 
- * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - -DEFINE_MUTEX(pm_mutex); - -#ifdef CONFIG_PM_SLEEP - -/* Routines for PM-transition notifications */ - -static BLOCKING_NOTIFIER_HEAD(pm_chain_head); - -int register_pm_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&pm_chain_head, nb); -} -EXPORT_SYMBOL_GPL(register_pm_notifier); - -int unregister_pm_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&pm_chain_head, nb); -} -EXPORT_SYMBOL_GPL(unregister_pm_notifier); - -int pm_notifier_call_chain(unsigned long val) -{ - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); - - return notifier_to_errno(ret); -} - -/* If set, devices may be suspended and resumed asynchronously. */ -int pm_async_enabled = 1; - -static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", pm_async_enabled); -} - -static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long val; - - if (strict_strtoul(buf, 10, &val)) - return -EINVAL; - - if (val > 1) - return -EINVAL; - - pm_async_enabled = val; - return n; -} - -power_attr(pm_async); - -#ifdef CONFIG_PM_DEBUG -int pm_test_level = TEST_NONE; - -static const char * const pm_tests[__TEST_AFTER_LAST] = { - [TEST_NONE] = "none", - [TEST_CORE] = "core", - [TEST_CPUS] = "processors", - [TEST_PLATFORM] = "platform", - [TEST_DEVICES] = "devices", - [TEST_FREEZER] = "freezer", -}; - -static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - char *s = buf; - int level; - - for (level = TEST_FIRST; level <= TEST_MAX; level++) - if (pm_tests[level]) { - if (level == pm_test_level) - s += sprintf(s, "[%s] ", 
pm_tests[level]); - else - s += sprintf(s, "%s ", pm_tests[level]); - } - - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; - - return (s - buf); -} - -static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - const char * const *s; - int level; - char *p; - int len; - int error = -EINVAL; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - lock_system_sleep(); - - level = TEST_FIRST; - for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { - pm_test_level = level; - error = 0; - break; - } - - unlock_system_sleep(); - - return error ? error : n; -} - -power_attr(pm_test); -#endif /* CONFIG_PM_DEBUG */ - -#ifdef CONFIG_DEBUG_FS -static char *suspend_step_name(enum suspend_stat_step step) -{ - switch (step) { - case SUSPEND_FREEZE: - return "freeze"; - case SUSPEND_PREPARE: - return "prepare"; - case SUSPEND_SUSPEND: - return "suspend"; - case SUSPEND_SUSPEND_NOIRQ: - return "suspend_noirq"; - case SUSPEND_RESUME_NOIRQ: - return "resume_noirq"; - case SUSPEND_RESUME: - return "resume"; - default: - return ""; - } -} - -static int suspend_stats_show(struct seq_file *s, void *unused) -{ - int i, index, last_dev, last_errno, last_step; - - last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; - last_dev %= REC_FAILED_NUM; - last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; - last_errno %= REC_FAILED_NUM; - last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; - last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", - "success", suspend_stats.success, - "fail", suspend_stats.fail, - "failed_freeze", suspend_stats.failed_freeze, - "failed_prepare", suspend_stats.failed_prepare, - "failed_suspend", suspend_stats.failed_suspend, - "failed_suspend_noirq", - suspend_stats.failed_suspend_noirq, - "failed_resume", 
suspend_stats.failed_resume, - "failed_resume_noirq", - suspend_stats.failed_resume_noirq); - seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", - suspend_stats.failed_devs[last_dev]); - for (i = 1; i < REC_FAILED_NUM; i++) { - index = last_dev + REC_FAILED_NUM - i; - index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-s\n", - suspend_stats.failed_devs[index]); - } - seq_printf(s, " last_failed_errno:\t%-d\n", - suspend_stats.errno[last_errno]); - for (i = 1; i < REC_FAILED_NUM; i++) { - index = last_errno + REC_FAILED_NUM - i; - index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-d\n", - suspend_stats.errno[index]); - } - seq_printf(s, " last_failed_step:\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[last_step])); - for (i = 1; i < REC_FAILED_NUM; i++) { - index = last_step + REC_FAILED_NUM - i; - index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[index])); - } - - return 0; -} - -static int suspend_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, suspend_stats_show, NULL); -} - -static const struct file_operations suspend_stats_operations = { - .open = suspend_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init pm_debugfs_init(void) -{ - debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, - NULL, NULL, &suspend_stats_operations); - return 0; -} - -late_initcall(pm_debugfs_init); -#endif /* CONFIG_DEBUG_FS */ - -#endif /* CONFIG_PM_SLEEP */ - -struct kobject *power_kobj; - -/** - * state - control system power state. - * - * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). - * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. 
- */ -static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - char *s = buf; -#ifdef CONFIG_SUSPEND - int i; - - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } -#endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; -#endif - return (s - buf); -} - -static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ -#ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; - const char * const *s; -#endif - char *p; - int len; - int error = -EINVAL; - - p = memchr(buf, '\n', n); - len = p ? p - buf : n; - - /* First, check if we are requested to hibernate */ - if (len == 4 && !strncmp(buf, "disk", len)) { - error = hibernate(); - goto Exit; - } - -#ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) - break; - } - if (state < PM_SUSPEND_MAX && *s) { - error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else - suspend_stats.success++; - } -#endif - - Exit: - return error ? error : n; -} - -power_attr(state); - -#ifdef CONFIG_PM_SLEEP -/* - * The 'wakeup_count' attribute, along with the functions defined in - * drivers/base/power/wakeup.c, provides a means by which wakeup events can be - * handled in a non-racy way. - * - * If a wakeup event occurs when the system is in a sleep state, it simply is - * woken up. In turn, if an event that would wake the system up from a sleep - * state occurs when it is undergoing a transition to that sleep state, the - * transition should be aborted. 
Moreover, if such an event occurs when the - * system is in the working state, an attempt to start a transition to the - * given sleep state should fail during certain period after the detection of - * the event. Using the 'state' attribute alone is not sufficient to satisfy - * these requirements, because a wakeup event may occur exactly when 'state' - * is being written to and may be delivered to user space right before it is - * frozen, so the event will remain only partially processed until the system is - * woken up by another event. In particular, it won't cause the transition to - * a sleep state to be aborted. - * - * This difficulty may be overcome if user space uses 'wakeup_count' before - * writing to 'state'. It first should read from 'wakeup_count' and store - * the read value. Then, after carrying out its own preparations for the system - * transition to a sleep state, it should write the stored value to - * 'wakeup_count'. If that fails, at least one wakeup event has occurred since - * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it - * is allowed to write to 'state', but the transition will be aborted if there - * are any wakeup events detected after 'wakeup_count' was written to. - */ - -static ssize_t wakeup_count_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - unsigned int val; - - return pm_get_wakeup_count(&val) ? 
sprintf(buf, "%u\n", val) : -EINTR; -} - -static ssize_t wakeup_count_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned int val; - - if (sscanf(buf, "%u", &val) == 1) { - if (pm_save_wakeup_count(val)) - return n; - } - return -EINVAL; -} - -power_attr(wakeup_count); -#endif /* CONFIG_PM_SLEEP */ - -#ifdef CONFIG_PM_TRACE -int pm_trace_enabled; - -static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", pm_trace_enabled); -} - -static ssize_t -pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - int val; - - if (sscanf(buf, "%d", &val) == 1) { - pm_trace_enabled = !!val; - return n; - } - return -EINVAL; -} - -power_attr(pm_trace); - -static ssize_t pm_trace_dev_match_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return show_trace_dev_match(buf, PAGE_SIZE); -} - -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); - -#endif /* CONFIG_PM_TRACE */ - -static struct attribute * g[] = { - &state_attr.attr, -#ifdef CONFIG_PM_TRACE - &pm_trace_attr.attr, - &pm_trace_dev_match_attr.attr, -#endif -#ifdef CONFIG_PM_SLEEP - &pm_async_attr.attr, - &wakeup_count_attr.attr, -#ifdef CONFIG_PM_DEBUG - &pm_test_attr.attr, -#endif -#endif - NULL, -}; - -static struct attribute_group attr_group = { - .attrs = g, -}; - -#ifdef CONFIG_PM_RUNTIME -struct workqueue_struct *pm_wq; -EXPORT_SYMBOL_GPL(pm_wq); - -static int __init pm_start_workqueue(void) -{ - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); - - return pm_wq ? 
0 : -ENOMEM; -} -#else -static inline int pm_start_workqueue(void) { return 0; } -#endif - -static int __init pm_init(void) -{ - int error = pm_start_workqueue(); - if (error) - return error; - hibernate_image_size_init(); - hibernate_reserved_size_init(); - power_kobj = kobject_create_and_add("power", NULL); - if (!power_kobj) - return -ENOMEM; - return sysfs_create_group(power_kobj, &attr_group); -} - -core_initcall(pm_init); -/* - * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * When the user hits Sys-Rq o to power down the machine this is the - * callback we use. - */ - -static void do_poweroff(struct work_struct *dummy) -{ - kernel_power_off(); -} - -static DECLARE_WORK(poweroff_work, do_poweroff); - -static void handle_poweroff(int key) -{ - /* run sysrq poweroff on boot cpu */ - schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); -} - -static struct sysrq_key_op sysrq_poweroff_op = { - .handler = handle_poweroff, - .help_msg = "powerOff", - .action_msg = "Power Off", - .enable_mask = SYSRQ_ENABLE_BOOT, -}; - -static int pm_sysrq_init(void) -{ - register_sysrq_key('o', &sysrq_poweroff_op); - return 0; -} - -subsys_initcall(pm_sysrq_init); -/* - * drivers/power/process.c - Functions for starting/stopping processes on - * suspend transitions. - * - * Originally from swsusp. 
- */ - - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Timeout for stopping processes - */ -#define TIMEOUT (20 * HZ) - -static int try_to_freeze_tasks(bool user_only) -{ - struct task_struct *g, *p; - unsigned long end_time; - unsigned int todo; - bool wq_busy = false; - struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; - bool wakeup = false; - - do_gettimeofday(&start); - - end_time = jiffies + TIMEOUT; - - if (!user_only) - freeze_workqueues_begin(); - - while (true) { - todo = 0; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (p == current || !freeze_task(p)) - continue; - - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - * - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) - todo++; - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - - if (!user_only) { - wq_busy = freeze_workqueues_busy(); - todo += wq_busy; - } - - if (!todo || time_after(jiffies, end_time)) - break; - - if (pm_wakeup_pending()) { - wakeup = true; - break; - } - - /* - * We need to retry, but first give the freezing tasks some - * time to enter the regrigerator. - */ - msleep(10); - } - - do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; - - if (todo) { - printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", - wakeup ? 
"aborted" : "failed", - elapsed_csecs / 100, elapsed_csecs % 100, - todo - wq_busy, wq_busy); - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!wakeup && !freezer_should_skip(p) && - p != current && freezing(p) && !frozen(p)) - sched_show_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); - } - - return todo ? -EBUSY : 0; -} - -/** - * freeze_processes - Signal user space processes to enter the refrigerator. - * - * On success, returns 0. On failure, -errno and system is fully thawed. - */ -int freeze_processes(void) -{ - int error; - - if (!pm_freezing) - atomic_inc(&system_freezing_cnt); - - printk("Freezing user space processes ... "); - pm_freezing = true; - error = try_to_freeze_tasks(true); - if (!error) { - printk("done."); - oom_killer_disable(); - } - printk("\n"); - BUG_ON(in_atomic()); - - if (error) - thaw_processes(); - return error; -} - -/** - * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. - * - * On success, returns 0. On failure, -errno and only the kernel threads are - * thawed, so as to give a chance to the caller to do additional cleanups - * (if any) before thawing the userspace tasks. So, it is the responsibility - * of the caller to thaw the userspace tasks, when the time is right. - */ -int freeze_kernel_threads(void) -{ - int error; - - printk("Freezing remaining freezable tasks ... "); - pm_nosig_freezing = true; - error = try_to_freeze_tasks(false); - if (!error) - printk("done."); - - printk("\n"); - BUG_ON(in_atomic()); - - if (error) - thaw_kernel_threads(); - return error; -} - -void thaw_processes(void) -{ - struct task_struct *g, *p; - - if (pm_freezing) - atomic_dec(&system_freezing_cnt); - pm_freezing = false; - pm_nosig_freezing = false; - - oom_killer_enable(); - - printk("Restarting tasks ... 
"); - - thaw_workqueues(); - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - __thaw_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - - schedule(); - printk("done.\n"); -} - -void thaw_kernel_threads(void) -{ - struct task_struct *g, *p; - - pm_nosig_freezing = false; - printk("Restarting kernel threads ... "); - - thaw_workqueues(); - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) - __thaw_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - - schedule(); - printk("done.\n"); -} -/* - * This module exposes the interface to kernel space for specifying - * QoS dependencies. It provides infrastructure for registration of: - * - * Dependents on a QoS value : register requests - * Watchers of QoS value : get notified when target QoS value changes - * - * This QoS design is best effort based. Dependents register their QoS needs. - * Watchers register to keep track of the current QoS needs of the system. - * - * There are 3 basic classes of QoS parameter: latency, timeout, throughput - * each have defined units: - * latency: usec - * timeout: usec <-- currently not used. - * throughput: kbs (kilo byte / sec) - * - * There are lists of pm_qos_objects each one wrapping requests, notifiers - * - * User mode requests on a QOS parameter register themselves to the - * subsystem by opening the device node /dev/... and writing there request to - * the node. As long as the process holds a file handle open to the node the - * client continues to be accounted for. Upon file release the usermode - * request is removed and a new qos target is computed. This way when the - * request that the application has is cleaned up when closes the file - * pointer or exits the pm_qos_object will get an opportunity to clean up. 
- * - * Mark Gross - */ - -/*#define DEBUG*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * locking rule: all changes to constraints or notifiers lists - * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock - * held, taken with _irqsave. One lock to rule them all - */ -struct pm_qos_object { - struct pm_qos_constraints *constraints; - struct miscdevice pm_qos_power_miscdev; - char *name; -}; - -static DEFINE_SPINLOCK(pm_qos_lock); - -static struct pm_qos_object null_pm_qos; - -static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_constraints cpu_dma_constraints = { - .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), - .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &cpu_dma_lat_notifier, -}; -static struct pm_qos_object cpu_dma_pm_qos = { - .constraints = &cpu_dma_constraints, - .name = "cpu_dma_latency", -}; - -static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_constraints network_lat_constraints = { - .list = PLIST_HEAD_INIT(network_lat_constraints.list), - .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &network_lat_notifier, -}; -static struct pm_qos_object network_lat_pm_qos = { - .constraints = &network_lat_constraints, - .name = "network_latency", -}; - - -static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_constraints network_tput_constraints = { - .list = PLIST_HEAD_INIT(network_tput_constraints.list), - .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .type = PM_QOS_MAX, - .notifiers = &network_throughput_notifier, -}; -static struct pm_qos_object network_throughput_pm_qos = { - .constraints = 
&network_tput_constraints, - .name = "network_throughput", -}; - - -static struct pm_qos_object *pm_qos_array[] = { - &null_pm_qos, - &cpu_dma_pm_qos, - &network_lat_pm_qos, - &network_throughput_pm_qos -}; - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos); -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos); -static int pm_qos_power_open(struct inode *inode, struct file *filp); -static int pm_qos_power_release(struct inode *inode, struct file *filp); - -static const struct file_operations pm_qos_power_fops = { - .write = pm_qos_power_write, - .read = pm_qos_power_read, - .open = pm_qos_power_open, - .release = pm_qos_power_release, - .llseek = noop_llseek, -}; - -/* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_constraints *c) -{ - if (plist_head_empty(&c->list)) - return c->default_value; - - switch (c->type) { - case PM_QOS_MIN: - return plist_first(&c->list)->prio; - - case PM_QOS_MAX: - return plist_last(&c->list)->prio; - - default: - /* runtime check for not using enum */ - BUG(); - } -} - -s32 pm_qos_read_value(struct pm_qos_constraints *c) -{ - return c->target_value; -} - -static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) -{ - c->target_value = value; -} - -/** - * pm_qos_update_target - manages the constraints list and calls the notifiers - * if needed - * @c: constraints data struct - * @node: request to add to the list, to update or to remove - * @action: action to take on the constraints list - * @value: value of the request to add or update - * - * This function returns 1 if the aggregated constraint value has changed, 0 - * otherwise. 
- */ -int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, - enum pm_qos_req_action action, int value) -{ - unsigned long flags; - int prev_value, curr_value, new_value; - - spin_lock_irqsave(&pm_qos_lock, flags); - prev_value = pm_qos_get_value(c); - if (value == PM_QOS_DEFAULT_VALUE) - new_value = c->default_value; - else - new_value = value; - - switch (action) { - case PM_QOS_REMOVE_REQ: - plist_del(node, &c->list); - break; - case PM_QOS_UPDATE_REQ: - /* - * to change the list, we atomically remove, reinit - * with new value and add, then see if the extremal - * changed - */ - plist_del(node, &c->list); - case PM_QOS_ADD_REQ: - plist_node_init(node, new_value); - plist_add(node, &c->list); - break; - default: - /* no action */ - ; - } - - curr_value = pm_qos_get_value(c); - pm_qos_set_value(c, curr_value); - - spin_unlock_irqrestore(&pm_qos_lock, flags); - - if (prev_value != curr_value) { - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - return 1; - } else { - return 0; - } -} - -/** - * pm_qos_request - returns current system wide qos expectation - * @pm_qos_class: identification of which qos value is requested - * - * This function returns the current target value. - */ -int pm_qos_request(int pm_qos_class) -{ - return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); -} -EXPORT_SYMBOL_GPL(pm_qos_request); - -int pm_qos_request_active(struct pm_qos_request *req) -{ - return req->pm_qos_class != 0; -} -EXPORT_SYMBOL_GPL(pm_qos_request_active); - -/** - * pm_qos_add_request - inserts new qos request into the list - * @req: pointer to a preallocated handle - * @pm_qos_class: identifies which list of qos request to use - * @value: defines the qos request - * - * This function inserts a new entry in the pm_qos_class list of requested qos - * performance characteristics. 
It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request - * handle. Caller needs to save this handle for later use in updates and - * removal. - */ - -void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) -{ - if (!req) /*guard against callers passing in null */ - return; - - if (pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); - return; - } - req->pm_qos_class = pm_qos_class; - pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); -} -EXPORT_SYMBOL_GPL(pm_qos_add_request); - -/** - * pm_qos_update_request - modifies an existing qos request - * @req : handle to list element holding a pm_qos request to use - * @value: defines the qos request - * - * Updates an existing qos request for the pm_qos_class of parameters along - * with updating the target pm_qos_class value. - * - * Attempts are made to make this code callable on hot code paths. - */ -void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) -{ - if (!req) /*guard against callers passing in null */ - return; - - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); - return; - } - - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); -} -EXPORT_SYMBOL_GPL(pm_qos_update_request); - -/** - * pm_qos_remove_request - modifies an existing qos request - * @req: handle to request list element - * - * Will remove pm qos request from the list of constraints and - * recompute the current target value for the pm_qos_class. Call this - * on slow code paths. 
- */ -void pm_qos_remove_request(struct pm_qos_request *req) -{ - if (!req) /*guard against callers passing in null */ - return; - /* silent return to keep pcm code cleaner */ - - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); - return; - } - - pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, - PM_QOS_DEFAULT_VALUE); - memset(req, 0, sizeof(*req)); -} -EXPORT_SYMBOL_GPL(pm_qos_remove_request); - -/** - * pm_qos_add_notifier - sets notification entry for changes to target value - * @pm_qos_class: identifies which qos target changes should be notified. - * @notifier: notifier block managed by caller. - * - * will register the notifier into a notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_add_notifier); - -/** - * pm_qos_remove_notifier - deletes notification entry from chain. - * @pm_qos_class: identifies which qos target changes are notified. - * @notifier: notifier block to be removed. - * - * will remove the notifier from the notification chain that gets called - * upon changes to the pm_qos_class target value. 
- */ -int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) -{ - int retval; - - retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); - - return retval; -} -EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); - -/* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos) -{ - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - return misc_register(&qos->pm_qos_power_miscdev); -} - -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; - - for (pm_qos_class = 0; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; - } - return -1; -} - -static int pm_qos_power_open(struct inode *inode, struct file *filp) -{ - long pm_qos_class; - - pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= 0) { - struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return -ENOMEM; - - pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); - filp->private_data = req; - - return 0; - } - return -EPERM; -} - -static int pm_qos_power_release(struct inode *inode, struct file *filp) -{ - struct pm_qos_request *req; - - req = filp->private_data; - pm_qos_remove_request(req); - kfree(req); - - return 0; -} - - -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos) -{ - s32 value; - unsigned long flags; - struct pm_qos_request *req = filp->private_data; - - if (!req) - return -EINVAL; - if (!pm_qos_request_active(req)) - return -EINVAL; - - spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return simple_read_from_buffer(buf, 
count, f_pos, &value, sizeof(s32)); -} - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) -{ - s32 value; - struct pm_qos_request *req; - - if (count == sizeof(s32)) { - if (copy_from_user(&value, buf, sizeof(s32))) - return -EFAULT; - } else if (count <= 11) { /* ASCII perhaps? */ - char ascii_value[11]; - unsigned long int ulval; - int ret; - - if (copy_from_user(ascii_value, buf, count)) - return -EFAULT; - - if (count > 10) { - if (ascii_value[10] == '\n') - ascii_value[10] = '\0'; - else - return -EINVAL; - } else { - ascii_value[count] = '\0'; - } - ret = strict_strtoul(ascii_value, 16, &ulval); - if (ret) { - pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); - return -EINVAL; - } - value = (s32)lower_32_bits(ulval); - } else { - return -EINVAL; - } - - req = filp->private_data; - pm_qos_update_request(req, value); - - return count; -} - - -static int __init pm_qos_power_init(void) -{ - int ret = 0; - - ret = register_pm_qos_misc(&cpu_dma_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_lat_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_throughput_pm_qos); - if (ret < 0) - printk(KERN_ERR - "pm_qos_param: network_throughput setup failed\n"); - - return ret; -} - -late_initcall(pm_qos_power_init); -/* - * linux/kernel/power/snapshot.c - * - * This file provides system snapshot/restore functionality for swsusp. - * - * Copyright (C) 1998-2005 Pavel Machek - * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "power.h" - -static int swsusp_page_is_free(struct page *); -static void swsusp_set_page_forbidden(struct page *); -static void swsusp_unset_page_forbidden(struct page *); - -/* - * Number of bytes to reserve for memory allocations made by device drivers - * from their ->freeze() and ->freeze_noirq() callbacks so that they don't - * cause image creation to fail (tunable via /sys/power/reserved_size). - */ -unsigned long reserved_size; - -void __init hibernate_reserved_size_init(void) -{ - reserved_size = SPARE_PAGES * PAGE_SIZE; -} - -/* - * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. - */ -unsigned long image_size; - -void __init hibernate_image_size_init(void) -{ - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; -} - -/* List of PBEs needed for restoring the pages that were allocated before - * the suspend and included in the suspend image, but have also been - * allocated by the "resume" kernel, so their contents cannot be written - * directly to their "original" page frames. - */ -struct pbe *restore_pblist; - -/* Pointer to an auxiliary buffer (1 page) */ -static void *buffer; - -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. The unsafe pages have PageNosaveFree set - * and we count them using unsafe_pages. - * - * Each allocated image page is marked as PageNosave and PageNosaveFree - * so that swsusp_free() can release it. 
- */ - -#define PG_ANY 0 -#define PG_SAFE 1 -#define PG_UNSAFE_CLEAR 1 -#define PG_UNSAFE_KEEP 0 - -static unsigned int allocated_unsafe_pages; - -static void *get_image_page(gfp_t gfp_mask, int safe_needed) -{ - void *res; - - res = (void *)get_zeroed_page(gfp_mask); - if (safe_needed) - while (res && swsusp_page_is_free(virt_to_page(res))) { - /* The page is unsafe, mark it for swsusp_free() */ - swsusp_set_page_forbidden(virt_to_page(res)); - allocated_unsafe_pages++; - res = (void *)get_zeroed_page(gfp_mask); - } - if (res) { - swsusp_set_page_forbidden(virt_to_page(res)); - swsusp_set_page_free(virt_to_page(res)); - } - return res; -} - -unsigned long get_safe_page(gfp_t gfp_mask) -{ - return (unsigned long)get_image_page(gfp_mask, PG_SAFE); -} - -static struct page *alloc_image_page(gfp_t gfp_mask) -{ - struct page *page; - - page = alloc_page(gfp_mask); - if (page) { - swsusp_set_page_forbidden(page); - swsusp_set_page_free(page); - } - return page; -} - -/** - * free_image_page - free page represented by @addr, allocated with - * get_image_page (page flags set by it must be cleared) - */ - -static inline void free_image_page(void *addr, int clear_nosave_free) -{ - struct page *page; - - BUG_ON(!virt_addr_valid(addr)); - - page = virt_to_page(addr); - - swsusp_unset_page_forbidden(page); - if (clear_nosave_free) - swsusp_unset_page_free(page); - - __free_page(page); -} - -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) - -struct linked_page { - struct linked_page *next; - char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); - -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) -{ - while (list) { - struct linked_page *lp = list->next; - - free_image_page(list, clear_page_nosave); - list = lp; - } -} - -/** - * struct chain_allocator is used for allocating small objects out of - * a linked list of pages called 'the chain'. 
- * - * The chain grows each time when there is no room for a new object in - * the current page. The allocated objects cannot be freed individually. - * It is only possible to free them all at once, by freeing the entire - * chain. - * - * NOTE: The chain allocator may be inefficient if the allocated objects - * are not much smaller than PAGE_SIZE. - */ - -struct chain_allocator { - struct linked_page *chain; /* the chain */ - unsigned int used_space; /* total size of objects allocated out - * of the current page - */ - gfp_t gfp_mask; /* mask for allocating pages */ - int safe_needed; /* if set, only "safe" pages are allocated */ -}; - -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) -{ - ca->chain = NULL; - ca->used_space = LINKED_PAGE_DATA_SIZE; - ca->gfp_mask = gfp_mask; - ca->safe_needed = safe_needed; -} - -static void *chain_alloc(struct chain_allocator *ca, unsigned int size) -{ - void *ret; - - if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { - struct linked_page *lp; - - lp = get_image_page(ca->gfp_mask, ca->safe_needed); - if (!lp) - return NULL; - - lp->next = ca->chain; - ca->chain = lp; - ca->used_space = 0; - } - ret = ca->chain->data + ca->used_space; - ca->used_space += size; - return ret; -} - -/** - * Data types related to memory bitmaps. - * - * Memory bitmap is a structure consiting of many linked lists of - * objects. The main list's elements are of type struct zone_bitmap - * and each of them corresonds to one zone. For each zone bitmap - * object there is a list of objects of type struct bm_block that - * represent each blocks of bitmap in which information is stored. - * - * struct memory_bitmap contains a pointer to the main list of zone - * bitmap objects, a struct bm_position used for browsing the bitmap, - * and a pointer to the list of pages used for allocating all of the - * zone bitmap objects and bitmap block objects. 
- * - * NOTE: It has to be possible to lay out the bitmap in memory - * using only allocations of order 0. Additionally, the bitmap is - * designed to work with arbitrary number of zones (this is over the - * top for now, but let's avoid making unnecessary assumptions ;-). - * - * struct zone_bitmap contains a pointer to a list of bitmap block - * objects and a pointer to the bitmap block object that has been - * most recently used for setting bits. Additionally, it contains the - * pfns that correspond to the start and end of the represented zone. - * - * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bitmap) - * It also contains the pfns that correspond to the start and end of - * the represented memory area. - */ - -#define BM_END_OF_MAP (~0UL) - -#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) - -struct bm_block { - struct list_head hook; /* hook into a list of bitmap blocks */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ -}; - -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} - -/* strcut bm_position is used for browsing memory bitmaps */ - -struct bm_position { - struct bm_block *block; - int bit; -}; - -struct memory_bitmap { - struct list_head blocks; /* list of bitmap blocks */ - struct linked_page *p_list; /* list of pages used to store zone - * bitmap objects and bitmap block - * objects - */ - struct bm_position cur; /* most recently used bit position */ -}; - -/* Functions that operate on memory bitmaps */ - -static void memory_bm_position_reset(struct memory_bitmap *bm) -{ - bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); - bm->cur.bit = 0; -} - -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); - -/** - * create_bm_block_list - 
create a list of block bitmap objects - * @pages - number of pages to track - * @list - list to put the allocated blocks into - * @ca - chain allocator to be used for allocating memory - */ -static int create_bm_block_list(unsigned long pages, - struct list_head *list, - struct chain_allocator *ca) -{ - unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); - - while (nr_blocks-- > 0) { - struct bm_block *bb; - - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) - return -ENOMEM; - list_add(&bb->hook, list); - } - - return 0; -} - -struct mem_extent { - struct list_head hook; - unsigned long start; - unsigned long end; -}; - -/** - * free_mem_extents - free a list of memory extents - * @list - list of extents to empty - */ -static void free_mem_extents(struct list_head *list) -{ - struct mem_extent *ext, *aux; - - list_for_each_entry_safe(ext, aux, list, hook) { - list_del(&ext->hook); - kfree(ext); - } -} - -/** - * create_mem_extents - create a list of memory extents representing - * contiguous ranges of PFNs - * @list - list to put the extents into - * @gfp_mask - mask to use for memory allocations - */ -static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) -{ - struct zone *zone; - - INIT_LIST_HEAD(list); - - for_each_populated_zone(zone) { - unsigned long zone_start, zone_end; - struct mem_extent *ext, *cur, *aux; - - zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; - - list_for_each_entry(ext, list, hook) - if (zone_start <= ext->end) - break; - - if (&ext->hook == list || zone_end < ext->start) { - /* New extent is necessary */ - struct mem_extent *new_ext; - - new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); - if (!new_ext) { - free_mem_extents(list); - return -ENOMEM; - } - new_ext->start = zone_start; - new_ext->end = zone_end; - list_add_tail(&new_ext->hook, &ext->hook); - continue; - } - - /* Merge this zone's range of PFNs with the existing one */ - if (zone_start < 
ext->start) - ext->start = zone_start; - if (zone_end > ext->end) - ext->end = zone_end; - - /* More merging may be possible */ - cur = ext; - list_for_each_entry_safe_continue(cur, aux, list, hook) { - if (zone_end < cur->start) - break; - if (zone_end < cur->end) - ext->end = cur->end; - list_del(&cur->hook); - kfree(cur); - } - } - - return 0; -} - -/** - * memory_bm_create - allocate memory for a memory bitmap - */ -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) -{ - struct chain_allocator ca; - struct list_head mem_extents; - struct mem_extent *ext; - int error; - - chain_init(&ca, gfp_mask, safe_needed); - INIT_LIST_HEAD(&bm->blocks); - - error = create_mem_extents(&mem_extents, gfp_mask); - if (error) - return error; - - list_for_each_entry(ext, &mem_extents, hook) { - struct bm_block *bb; - unsigned long pfn = ext->start; - unsigned long pages = ext->end - ext->start; - - bb = list_entry(bm->blocks.prev, struct bm_block, hook); - - error = create_bm_block_list(pages, bm->blocks.prev, &ca); - if (error) - goto Error; - - list_for_each_entry_continue(bb, &bm->blocks, hook) { - bb->data = get_image_page(gfp_mask, safe_needed); - if (!bb->data) { - error = -ENOMEM; - goto Error; - } - - bb->start_pfn = pfn; - if (pages >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - pages -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += pages; - } - bb->end_pfn = pfn; - } - } - - bm->p_list = ca.chain; - memory_bm_position_reset(bm); - Exit: - free_mem_extents(&mem_extents); - return error; - - Error: - bm->p_list = ca.chain; - memory_bm_free(bm, PG_UNSAFE_CLEAR); - goto Exit; -} - -/** - * memory_bm_free - free memory occupied by the memory bitmap @bm - */ -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) -{ - struct bm_block *bb; - - list_for_each_entry(bb, &bm->blocks, hook) - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - - 
free_list_of_pages(bm->p_list, clear_nosave_free); - - INIT_LIST_HEAD(&bm->blocks); -} - -/** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. - */ -static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) -{ - struct bm_block *bb; - - /* - * Check if the pfn corresponds to the current bitmap block and find - * the block where it fits if this is not the case. - */ - bb = bm->cur.block; - if (pfn < bb->start_pfn) - list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn) - break; - - if (pfn >= bb->end_pfn) - list_for_each_entry_continue(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn && pfn < bb->end_pfn) - break; - - if (&bb->hook == &bm->blocks) - return -EFAULT; - - /* The block has been found */ - bm->cur.block = bb; - pfn -= bb->start_pfn; - bm->cur.bit = pfn + 1; - *bit_nr = pfn; - *addr = bb->data; - return 0; -} - -static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - set_bit(bit, addr); -} - -static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - if (!error) - set_bit(bit, addr); - return error; -} - -static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - clear_bit(bit, addr); -} - -static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - int error; - - error = memory_bm_find_bit(bm, pfn, &addr, &bit); - BUG_ON(error); - return test_bit(bit, addr); -} - -static bool 
memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) -{ - void *addr; - unsigned int bit; - - return !memory_bm_find_bit(bm, pfn, &addr, &bit); -} - -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. - * - * It is required to run memory_bm_position_reset() before the first call to - * this function. - */ - -static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) -{ - struct bm_block *bb; - int bit; - - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = list_entry(bb->hook.next, struct bm_block, hook); - bm->cur.block = bb; - bm->cur.bit = 0; - } while (&bb->hook != &bm->blocks); - - memory_bm_position_reset(bm); - return BM_END_OF_MAP; - - Return_pfn: - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; -} - -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. 
- */ - -struct nosave_region { - struct list_head list; - unsigned long start_pfn; - unsigned long end_pfn; -}; - -static LIST_HEAD(nosave_regions); - -/** - * register_nosave_region - register a range of page frames the contents - * of which should not be saved during the suspend (to be used in the early - * initialization code) - */ - -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, - int use_kmalloc) -{ - struct nosave_region *region; - - if (start_pfn >= end_pfn) - return; - - if (!list_empty(&nosave_regions)) { - /* Try to extend the previous region (they should be sorted) */ - region = list_entry(nosave_regions.prev, - struct nosave_region, list); - if (region->end_pfn == start_pfn) { - region->end_pfn = end_pfn; - goto Report; - } - } - if (use_kmalloc) { - /* during init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else - /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); - region->start_pfn = start_pfn; - region->end_pfn = end_pfn; - list_add_tail(®ion->list, &nosave_regions); - Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); -} - -/* - * Set bits in this map correspond to the page frames the contents of which - * should not be saved during the suspend. - */ -static struct memory_bitmap *forbidden_pages_map; - -/* Set bits in this map correspond to free page frames. */ -static struct memory_bitmap *free_pages_map; - -/* - * Each page frame allocated for creating the image is marked by setting the - * corresponding bits in forbidden_pages_map and free_pages_map simultaneously - */ - -void swsusp_set_page_free(struct page *page) -{ - if (free_pages_map) - memory_bm_set_bit(free_pages_map, page_to_pfn(page)); -} - -static int swsusp_page_is_free(struct page *page) -{ - return free_pages_map ? 
- memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; -} - -void swsusp_unset_page_free(struct page *page) -{ - if (free_pages_map) - memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); -} - -static void swsusp_set_page_forbidden(struct page *page) -{ - if (forbidden_pages_map) - memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); -} - -int swsusp_page_is_forbidden(struct page *page) -{ - return forbidden_pages_map ? - memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; -} - -static void swsusp_unset_page_forbidden(struct page *page) -{ - if (forbidden_pages_map) - memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); -} - -/** - * mark_nosave_pages - set bits corresponding to the page frames the - * contents of which should not be saved in a given bitmap. - */ - -static void mark_nosave_pages(struct memory_bitmap *bm) -{ - struct nosave_region *region; - - if (list_empty(&nosave_regions)) - return; - - list_for_each_entry(region, &nosave_regions, list) { - unsigned long pfn; - - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); - - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } - } -} - -/** - * create_basic_memory_bitmaps - create bitmaps needed for marking page - * frames that should not be saved and free page frames. The pointers - * forbidden_pages_map and free_pages_map are only modified if everything - * goes well, because we don't want the bits to be used before both bitmaps - * are set up. 
- */ - -int create_basic_memory_bitmaps(void) -{ - struct memory_bitmap *bm1, *bm2; - int error = 0; - - BUG_ON(forbidden_pages_map || free_pages_map); - - bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm1) - return -ENOMEM; - - error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); - if (error) - goto Free_first_object; - - bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); - if (!bm2) - goto Free_first_bitmap; - - error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); - if (error) - goto Free_second_object; - - forbidden_pages_map = bm1; - free_pages_map = bm2; - mark_nosave_pages(forbidden_pages_map); - - pr_debug("PM: Basic memory bitmaps created\n"); - - return 0; - - Free_second_object: - kfree(bm2); - Free_first_bitmap: - memory_bm_free(bm1, PG_UNSAFE_CLEAR); - Free_first_object: - kfree(bm1); - return -ENOMEM; -} - -/** - * free_basic_memory_bitmaps - free memory bitmaps allocated by - * create_basic_memory_bitmaps(). The auxiliary pointers are necessary - * so that the bitmaps themselves are not referred to while they are being - * freed. 
- */ - -void free_basic_memory_bitmaps(void) -{ - struct memory_bitmap *bm1, *bm2; - - BUG_ON(!(forbidden_pages_map && free_pages_map)); - - bm1 = forbidden_pages_map; - bm2 = free_pages_map; - forbidden_pages_map = NULL; - free_pages_map = NULL; - memory_bm_free(bm1, PG_UNSAFE_CLEAR); - kfree(bm1); - memory_bm_free(bm2, PG_UNSAFE_CLEAR); - kfree(bm2); - - pr_debug("PM: Basic memory bitmaps freed\n"); -} - -/** - * snapshot_additional_pages - estimate the number of additional pages - * be needed for setting up the suspend image data structures for given - * zone (usually the returned value is greater than the exact number) - */ - -unsigned int snapshot_additional_pages(struct zone *zone) -{ - unsigned int res; - - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), - LINKED_PAGE_DATA_SIZE); - return 2 * res; -} - -#ifdef CONFIG_HIGHMEM -/** - * count_free_highmem_pages - compute the total number of free highmem - * pages, system-wide. - */ - -static unsigned int count_free_highmem_pages(void) -{ - struct zone *zone; - unsigned int cnt = 0; - - for_each_populated_zone(zone) - if (is_highmem(zone)) - cnt += zone_page_state(zone, NR_FREE_PAGES); - - return cnt; -} - -/** - * saveable_highmem_page - Determine whether a highmem page should be - * included in the suspend image. - * - * We should save the page if it isn't Nosave or NosaveFree, or Reserved, - * and it isn't a part of a free chunk of pages. 
- */ -static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) - return NULL; - - BUG_ON(!PageHighMem(page)); - - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) - return NULL; - - if (page_is_guard(page)) - return NULL; - - return page; -} - -/** - * count_highmem_pages - compute the total number of saveable highmem - * pages. - */ - -static unsigned int count_highmem_pages(void) -{ - struct zone *zone; - unsigned int n = 0; - - for_each_populated_zone(zone) { - unsigned long pfn, max_zone_pfn; - - if (!is_highmem(zone)) - continue; - - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(zone, pfn)) - n++; - } - return n; -} -#else -static inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} -#endif /* CONFIG_HIGHMEM */ - -/** - * saveable_page - Determine whether a non-highmem page should be included - * in the suspend image. - * - * We should save the page if it isn't Nosave, and is not in the range - * of pages statically defined as 'unsaveable', and it isn't a part of - * a free chunk of pages. - */ -static struct page *saveable_page(struct zone *zone, unsigned long pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (page_zone(page) != zone) - return NULL; - - BUG_ON(PageHighMem(page)); - - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) - return NULL; - - if (PageReserved(page) - && (!kernel_page_present(page) || pfn_is_nosave(pfn))) - return NULL; - - if (page_is_guard(page)) - return NULL; - - return page; -} - -/** - * count_data_pages - compute the total number of saveable non-highmem - * pages. 
- */ - -static unsigned int count_data_pages(void) -{ - struct zone *zone; - unsigned long pfn, max_zone_pfn; - unsigned int n = 0; - - for_each_populated_zone(zone) { - if (is_highmem(zone)) - continue; - - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_page(zone, pfn)) - n++; - } - return n; -} - -/* This is needed, because copy_page and memcpy are not usable for copying - * task structs. - */ -static inline void do_copy_page(long *dst, long *src) -{ - int n; - - for (n = PAGE_SIZE / sizeof(long); n; n--) - *dst++ = *src++; -} - - -/** - * safe_copy_page - check if the page we are going to copy is marked as - * present in the kernel page tables (this always is the case if - * CONFIG_DEBUG_PAGEALLOC is not set and in that case - * kernel_page_present() always returns 'true'). - */ -static void safe_copy_page(void *dst, struct page *s_page) -{ - if (kernel_page_present(s_page)) { - do_copy_page(dst, page_address(s_page)); - } else { - kernel_map_pages(s_page, 1, 1); - do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); - } -} - - -#ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) -{ - return is_highmem(zone) ? 
- saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); -} - -static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) -{ - struct page *s_page, *d_page; - void *src, *dst; - - s_page = pfn_to_page(src_pfn); - d_page = pfn_to_page(dst_pfn); - if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); - do_copy_page(dst, src); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - } else { - if (PageHighMem(d_page)) { - /* Page pointed to by src may contain some kernel - * data modified by kmap_atomic() - */ - safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page, KM_USER0); - copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); - } else { - safe_copy_page(page_address(d_page), s_page); - } - } -} -#else -#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) - -static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) -{ - safe_copy_page(page_address(pfn_to_page(dst_pfn)), - pfn_to_page(src_pfn)); -} -#endif /* CONFIG_HIGHMEM */ - -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) -{ - struct zone *zone; - unsigned long pfn; - - for_each_populated_zone(zone) { - unsigned long max_zone_pfn; - - mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (page_is_saveable(zone, pfn)) - memory_bm_set_bit(orig_bm, pfn); - } - memory_bm_position_reset(orig_bm); - memory_bm_position_reset(copy_bm); - for(;;) { - pfn = memory_bm_next_pfn(orig_bm); - if (unlikely(pfn == BM_END_OF_MAP)) - break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); - } -} - -/* Total number of image pages */ -static unsigned int nr_copy_pages; -/* Number of pages needed for saving the original pfns of the image pages */ -static unsigned int nr_meta_pages; -/* - * Numbers of normal and highmem page frames allocated for hibernation image - * before 
suspending devices. - */ -unsigned int alloc_normal, alloc_highmem; -/* - * Memory bitmap used for marking saveable pages (during hibernation) or - * hibernation image pages (during restore) - */ -static struct memory_bitmap orig_bm; -/* - * Memory bitmap used during hibernation for marking allocated page frames that - * will contain copies of saveable pages. During restore it is initially used - * for marking hibernation image pages, but then the set bits from it are - * duplicated in @orig_bm and it is released. On highmem systems it is next - * used for marking "safe" highmem pages, but it has to be reinitialized for - * this purpose. - */ -static struct memory_bitmap copy_bm; - -/** - * swsusp_free - free pages allocated for the suspend. - * - * Suspend pages are alocated before the atomic copy is made, so we - * need to release them after the resume. - */ - -void swsusp_free(void) -{ - struct zone *zone; - unsigned long pfn, max_zone_pfn; - - for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } - } - nr_copy_pages = 0; - nr_meta_pages = 0; - restore_pblist = NULL; - buffer = NULL; - alloc_normal = 0; - alloc_highmem = 0; -} - -/* Helper functions used for the shrinking of memory. */ - -#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) - -/** - * preallocate_image_pages - Allocate a number of pages for hibernation image - * @nr_pages: Number of page frames to allocate. - * @mask: GFP flags to use for the allocation. 
- * - * Return value: Number of page frames actually allocated - */ -static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) -{ - unsigned long nr_alloc = 0; - - while (nr_pages > 0) { - struct page *page; - - page = alloc_image_page(mask); - if (!page) - break; - memory_bm_set_bit(©_bm, page_to_pfn(page)); - if (PageHighMem(page)) - alloc_highmem++; - else - alloc_normal++; - nr_pages--; - nr_alloc++; - } - - return nr_alloc; -} - -static unsigned long preallocate_image_memory(unsigned long nr_pages, - unsigned long avail_normal) -{ - unsigned long alloc; - - if (avail_normal <= alloc_normal) - return 0; - - alloc = avail_normal - alloc_normal; - if (nr_pages < alloc) - alloc = nr_pages; - - return preallocate_image_pages(alloc, GFP_IMAGE); -} - -#ifdef CONFIG_HIGHMEM -static unsigned long preallocate_image_highmem(unsigned long nr_pages) -{ - return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); -} - -/** - * __fraction - Compute (an approximation of) x * (multiplier / base) - */ -static unsigned long __fraction(u64 x, u64 multiplier, u64 base) -{ - x *= multiplier; - do_div(x, base); - return (unsigned long)x; -} - -static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) -{ - unsigned long alloc = __fraction(nr_pages, highmem, total); - - return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); -} -#else /* CONFIG_HIGHMEM */ -static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) -{ - return 0; -} - -static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) -{ - return 0; -} -#endif /* CONFIG_HIGHMEM */ - -/** - * free_unnecessary_pages - Release preallocated pages not needed for the image - */ -static void free_unnecessary_pages(void) -{ - unsigned long save, to_free_normal, to_free_highmem; - - save = count_data_pages(); - if (alloc_normal >= save) { - 
to_free_normal = alloc_normal - save; - save = 0; - } else { - to_free_normal = 0; - save -= alloc_normal; - } - save += count_highmem_pages(); - if (alloc_highmem >= save) { - to_free_highmem = alloc_highmem - save; - } else { - to_free_highmem = 0; - save -= alloc_highmem; - if (to_free_normal > save) - to_free_normal -= save; - else - to_free_normal = 0; - } - - memory_bm_position_reset(©_bm); - - while (to_free_normal > 0 || to_free_highmem > 0) { - unsigned long pfn = memory_bm_next_pfn(©_bm); - struct page *page = pfn_to_page(pfn); - - if (PageHighMem(page)) { - if (!to_free_highmem) - continue; - to_free_highmem--; - alloc_highmem--; - } else { - if (!to_free_normal) - continue; - to_free_normal--; - alloc_normal--; - } - memory_bm_clear_bit(©_bm, pfn); - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } -} - -/** - * minimum_image_size - Estimate the minimum acceptable size of an image - * @saveable: Number of saveable pages in the system. - * - * We want to avoid attempting to free too much memory too hard, so estimate the - * minimum acceptable size of a hibernation image to use as the lower limit for - * preallocating memory. - * - * We assume that the minimum image size should be proportional to - * - * [number of saveable pages] - [number of pages that can be freed in theory] - * - * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, - * minus mapped file pages. - */ -static unsigned long minimum_image_size(unsigned long saveable) -{ - unsigned long size; - - size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); - - return saveable <= size ? 
0 : saveable - size; -} - -/** - * hibernate_preallocate_memory - Preallocate memory for hibernation image - * - * To create a hibernation image it is necessary to make a copy of every page - * frame in use. We also need a number of page frames to be free during - * hibernation for allocations made while saving the image and for device - * drivers, in case they need to allocate memory from their hibernation - * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through - * /sys/power/reserved_size, respectively). To make this happen, we compute the - * total number of available page frames and allocate at least - * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 - * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) - * - * of them, which corresponds to the maximum size of a hibernation image. - * - * If image_size is set below the number following from the above formula, - * the preallocation of memory is continued until the total number of saveable - * pages in the system is below the requested image size or the minimum - * acceptable image size returned by minimum_image_size(), whichever is greater. - */ -int hibernate_preallocate_memory(void) -{ - struct zone *zone; - unsigned long saveable, size, max_size, count, highmem, pages = 0; - unsigned long alloc, save_highmem, pages_highmem, avail_normal; - struct timeval start, stop; - int error; - - printk(KERN_INFO "PM: Preallocating image memory... "); - do_gettimeofday(&start); - - error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); - if (error) - goto err_out; - - error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); - if (error) - goto err_out; - - alloc_normal = 0; - alloc_highmem = 0; - - /* Count the number of saveable data pages. 
*/ - save_highmem = count_highmem_pages(); - saveable = count_data_pages(); - - /* - * Compute the total number of page frames we can use (count) and the - * number of pages needed for image metadata (size). - */ - count = saveable; - saveable += save_highmem; - highmem = save_highmem; - size = 0; - for_each_populated_zone(zone) { - size += snapshot_additional_pages(zone); - if (is_highmem(zone)) - highmem += zone_page_state(zone, NR_FREE_PAGES); - else - count += zone_page_state(zone, NR_FREE_PAGES); - } - avail_normal = count; - count += highmem; - count -= totalreserve_pages; - - /* Add number of pages required for page keys (s390 only). */ - size += page_key_additional_pages(saveable); - - /* Compute the maximum number of saveable pages to leave in memory. */ - max_size = (count - (size + PAGES_FOR_IO)) / 2 - - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); - /* Compute the desired number of image pages specified by image_size. */ - size = DIV_ROUND_UP(image_size, PAGE_SIZE); - if (size > max_size) - size = max_size; - /* - * If the desired number of image pages is at least as large as the - * current number of saveable pages in memory, allocate page frames for - * the image and we're done. - */ - if (size >= saveable) { - pages = preallocate_image_highmem(save_highmem); - pages += preallocate_image_memory(saveable - pages, avail_normal); - goto out; - } - - /* Estimate the minimum size of the image. */ - pages = minimum_image_size(saveable); - /* - * To avoid excessive pressure on the normal zone, leave room in it to - * accommodate an image of the minimum size (unless it's already too - * small, in which case don't preallocate pages from it at all). - */ - if (avail_normal > pages) - avail_normal -= pages; - else - avail_normal = 0; - if (size < pages) - size = min_t(unsigned long, pages, max_size); - - /* - * Let the memory management subsystem know that we're going to need a - * large number of page frames to allocate and make it free some memory. 
- * NOTE: If this is not done, performance will be hurt badly in some - * test cases. - */ - shrink_all_memory(saveable - size); - - /* - * The number of saveable pages in memory was too high, so apply some - * pressure to decrease it. First, make room for the largest possible - * image and fail if that doesn't work. Next, try to decrease the size - * of the image as much as indicated by 'size' using allocations from - * highmem and non-highmem zones separately. - */ - pages_highmem = preallocate_image_highmem(highmem / 2); - alloc = (count - max_size) - pages_highmem; - pages = preallocate_image_memory(alloc, avail_normal); - if (pages < alloc) { - /* We have exhausted non-highmem pages, try highmem. */ - alloc -= pages; - pages += pages_highmem; - pages_highmem = preallocate_image_highmem(alloc); - if (pages_highmem < alloc) - goto err_out; - pages += pages_highmem; - /* - * size is the desired number of saveable pages to leave in - * memory, so try to preallocate (all memory - size) pages. - */ - alloc = (count - pages) - size; - pages += preallocate_image_highmem(alloc); - } else { - /* - * There are approximately max_size saveable pages at this point - * and we want to reduce this number down to size. - */ - alloc = max_size - size; - size = preallocate_highmem_fraction(alloc, highmem, count); - pages_highmem += size; - alloc -= size; - size = preallocate_image_memory(alloc, avail_normal); - pages_highmem += preallocate_image_highmem(alloc - size); - pages += pages_highmem + size; - } - - /* - * We only need as many page frames for the image as there are saveable - * pages in memory, but we have allocated more. Release the excessive - * ones now. 
- */ - free_unnecessary_pages(); - - out: - do_gettimeofday(&stop); - printk(KERN_CONT "done (allocated %lu pages)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Allocated"); - - return 0; - - err_out: - printk(KERN_CONT "\n"); - swsusp_free(); - return -ENOMEM; -} - -#ifdef CONFIG_HIGHMEM -/** - * count_pages_for_highmem - compute the number of non-highmem pages - * that will be necessary for creating copies of highmem pages. - */ - -static unsigned int count_pages_for_highmem(unsigned int nr_highmem) -{ - unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; - - if (free_highmem >= nr_highmem) - nr_highmem = 0; - else - nr_highmem -= free_highmem; - - return nr_highmem; -} -#else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } -#endif /* CONFIG_HIGHMEM */ - -/** - * enough_free_mem - Make sure we have enough free memory for the - * snapshot image. - */ - -static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) -{ - struct zone *zone; - unsigned int free = alloc_normal; - - for_each_populated_zone(zone) - if (!is_highmem(zone)) - free += zone_page_state(zone, NR_FREE_PAGES); - - nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, free); - - return free > nr_pages + PAGES_FOR_IO; -} - -#ifdef CONFIG_HIGHMEM -/** - * get_highmem_buffer - if there are some highmem pages in the suspend - * image, we may need the buffer to copy them and/or load their data. - */ - -static inline int get_highmem_buffer(int safe_needed) -{ - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); - return buffer ? 0 : -ENOMEM; -} - -/** - * alloc_highmem_image_pages - allocate some highmem pages for the image. - * Try to allocate as many pages as needed, but if the number of free - * highmem pages is lesser than that, allocate them all. 
- */ - -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) -{ - unsigned int to_alloc = count_free_highmem_pages(); - - if (to_alloc > nr_highmem) - to_alloc = nr_highmem; - - nr_highmem -= to_alloc; - while (to_alloc-- > 0) { - struct page *page; - - page = alloc_image_page(__GFP_HIGHMEM); - memory_bm_set_bit(bm, page_to_pfn(page)); - } - return nr_highmem; -} -#else -static inline int get_highmem_buffer(int safe_needed) { return 0; } - -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } -#endif /* CONFIG_HIGHMEM */ - -/** - * swsusp_alloc - allocate memory for the suspend image - * - * We first try to allocate as many highmem pages as there are - * saveable highmem pages in the system. If that fails, we allocate - * non-highmem pages for the copies of the remaining highmem ones. - * - * In this approach it is likely that the copies of highmem pages will - * also be located in the high memory, because of the way in which - * copy_data_pages() works. 
- */ - -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages, unsigned int nr_highmem) -{ - if (nr_highmem > 0) { - if (get_highmem_buffer(PG_ANY)) - goto err_out; - if (nr_highmem > alloc_highmem) { - nr_highmem -= alloc_highmem; - nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); - } - } - if (nr_pages > alloc_normal) { - nr_pages -= alloc_normal; - while (nr_pages-- > 0) { - struct page *page; - - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); - if (!page) - goto err_out; - memory_bm_set_bit(copy_bm, page_to_pfn(page)); - } - } - - return 0; - - err_out: - swsusp_free(); - return -ENOMEM; -} - -asmlinkage int swsusp_save(void) -{ - unsigned int nr_pages, nr_highmem; - - printk(KERN_INFO "PM: Creating hibernation image:\n"); - - drain_local_pages(NULL); - nr_pages = count_data_pages(); - nr_highmem = count_highmem_pages(); - printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); - - if (!enough_free_mem(nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Not enough free memory\n"); - return -ENOMEM; - } - - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Memory allocation failed\n"); - return -ENOMEM; - } - - /* During allocating of suspend pagedir, new cold pages may appear. - * Kill them. - */ - drain_local_pages(NULL); - copy_data_pages(©_bm, &orig_bm); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. 
- */ - - nr_pages += nr_highmem; - nr_copy_pages = nr_pages; - nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - - printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", - nr_pages); - - return 0; -} - -#ifndef CONFIG_ARCH_HIBERNATION_HEADER -static int init_header_complete(struct swsusp_info *info) -{ - memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); - info->version_code = LINUX_VERSION_CODE; - return 0; -} - -static char *check_image_kernel(struct swsusp_info *info) -{ - if (info->version_code != LINUX_VERSION_CODE) - return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) - return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) - return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) - return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) - return "machine"; - return NULL; -} -#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ - -unsigned long snapshot_get_image_size(void) -{ - return nr_copy_pages + nr_meta_pages + 1; -} - -static int init_header(struct swsusp_info *info) -{ - memset(info, 0, sizeof(struct swsusp_info)); - info->num_physpages = num_physpages; - info->image_pages = nr_copy_pages; - info->pages = snapshot_get_image_size(); - info->size = info->pages; - info->size <<= PAGE_SHIFT; - return init_header_complete(info); -} - -/** - * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - * are stored in the array @buf[] (1 page at a time) - */ - -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - buf[j] = memory_bm_next_pfn(bm); - if (unlikely(buf[j] == BM_END_OF_MAP)) - break; - /* Save page key for data page (s390 only). */ - page_key_read(buf + j); - } -} - -/** - * snapshot_read_next - used for reading the system memory snapshot. 
- * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. - * - * On success the function returns a positive number. Then, the caller - * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. - * - * The function returns 0 to indicate the end of data stream condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. - */ - -int snapshot_read_next(struct snapshot_handle *handle) -{ - if (handle->cur > nr_meta_pages + nr_copy_pages) - return 0; - - if (!buffer) { - /* This makes the buffer be freed by swsusp_free() */ - buffer = get_image_page(GFP_ATOMIC, PG_ANY); - if (!buffer) - return -ENOMEM; - } - if (!handle->cur) { - int error; - - error = init_header((struct swsusp_info *)buffer); - if (error) - return error; - handle->buffer = buffer; - memory_bm_position_reset(&orig_bm); - memory_bm_position_reset(©_bm); - } else if (handle->cur <= nr_meta_pages) { - clear_page(buffer); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; - - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). 
- */ - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - copy_page(buffer, kaddr); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } - } - handle->cur++; - return PAGE_SIZE; -} - -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static int mark_unsafe_pages(struct memory_bitmap *bm) -{ - struct zone *zone; - unsigned long pfn, max_zone_pfn; - - /* Clear page flags */ - for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) - swsusp_unset_page_free(pfn_to_page(pfn)); - } - - /* Mark pages that correspond to the "original" pfns as "unsafe" */ - memory_bm_position_reset(bm); - do { - pfn = memory_bm_next_pfn(bm); - if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) - swsusp_set_page_free(pfn_to_page(pfn)); - else - return -EFAULT; - } - } while (pfn != BM_END_OF_MAP); - - allocated_unsafe_pages = 0; - - return 0; -} - -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) -{ - unsigned long pfn; - - memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); - while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); - } -} - -static int check_header(struct swsusp_info *info) -{ - char *reason; - - reason = check_image_kernel(info); - if (!reason && info->num_physpages != num_physpages) - reason = "memory size"; - if (reason) { - printk(KERN_ERR "PM: Image mismatch: %s\n", reason); - return -EPERM; - } - return 0; -} - -/** - * load header - check the image header and copy data from it - */ - -static int -load_header(struct swsusp_info *info) -{ - int error; - - restore_pblist = NULL; - error = check_header(info); - if (!error) { - nr_copy_pages = 
info->image_pages; - nr_meta_pages = info->pages - info->image_pages - 1; - } - return error; -} - -/** - * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - * the corresponding bit in the memory bitmap @bm - */ -static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - if (unlikely(buf[j] == BM_END_OF_MAP)) - break; - - /* Extract and buffer page key for data page (s390 only). */ - page_key_memorize(buf + j); - - if (memory_bm_pfn_present(bm, buf[j])) - memory_bm_set_bit(bm, buf[j]); - else - return -EFAULT; - } - - return 0; -} - -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; - -#ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that - * should be restored atomically during the resume from disk, because the page - * frames they have occupied before the suspend are in use. - */ -struct highmem_pbe { - struct page *copy_page; /* data is here now */ - struct page *orig_page; /* data was here before the suspend */ - struct highmem_pbe *next; -}; - -/* List of highmem PBEs needed for restoring the highmem pages that were - * allocated before the suspend and included in the suspend image, but have - * also been allocated by the "resume" kernel, so their contents cannot be - * written directly to their "original" page frames. - */ -static struct highmem_pbe *highmem_pblist; - -/** - * count_highmem_image_pages - compute the number of highmem pages in the - * suspend image. The bits in the memory bitmap @bm that correspond to the - * image pages are assumed to be set. 
- */ - -static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) -{ - unsigned long pfn; - unsigned int cnt = 0; - - memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm); - while (pfn != BM_END_OF_MAP) { - if (PageHighMem(pfn_to_page(pfn))) - cnt++; - - pfn = memory_bm_next_pfn(bm); - } - return cnt; -} - -/** - * prepare_highmem_image - try to allocate as many highmem pages as - * there are highmem image pages (@nr_highmem_p points to the variable - * containing the number of highmem image pages). The pages that are - * "safe" (ie. will not be overwritten when the suspend image is - * restored) have the corresponding bits set in @bm (it must be - * unitialized). - * - * NOTE: This function should not be called if there are no highmem - * image pages. - */ - -static unsigned int safe_highmem_pages; - -static struct memory_bitmap *safe_highmem_bm; - -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - unsigned int to_alloc; - - if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) - return -ENOMEM; - - if (get_highmem_buffer(PG_SAFE)) - return -ENOMEM; - - to_alloc = count_free_highmem_pages(); - if (to_alloc > *nr_highmem_p) - to_alloc = *nr_highmem_p; - else - *nr_highmem_p = to_alloc; - - safe_highmem_pages = 0; - while (to_alloc-- > 0) { - struct page *page; - - page = alloc_page(__GFP_HIGHMEM); - if (!swsusp_page_is_free(page)) { - /* The page is "safe", set its bit the bitmap */ - memory_bm_set_bit(bm, page_to_pfn(page)); - safe_highmem_pages++; - } - /* Mark the page as allocated */ - swsusp_set_page_forbidden(page); - swsusp_set_page_free(page); - } - memory_bm_position_reset(bm); - safe_highmem_bm = bm; - return 0; -} - -/** - * get_highmem_page_buffer - for given highmem image page find the buffer - * that suspend_write_next() should set for its caller to write to. 
- * - * If the page is to be saved to its "original" page frame or a copy of - * the page is to be made in the highmem, @buffer is returned. Otherwise, - * the copy of the page is to be made in normal memory, so the address of - * the copy is returned. - * - * If @buffer is returned, the caller of suspend_write_next() will write - * the page's contents to @buffer, so they will have to be copied to the - * right location on the next call to suspend_write_next() and it is done - * with the help of copy_last_highmem_page(). For this purpose, if - * @buffer is returned, @last_highmem page is set to the page to which - * the data will have to be copied from @buffer. - */ - -static struct page *last_highmem_page; - -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) -{ - struct highmem_pbe *pbe; - void *kaddr; - - if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { - /* We have allocated the "original" page frame and we can - * use it directly to store the loaded page. - */ - last_highmem_page = page; - return buffer; - } - /* The "original" page frame has not been allocated and we have to - * use a "safe" page frame to store the loaded page. 
- */ - pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); - if (!pbe) { - swsusp_free(); - return ERR_PTR(-ENOMEM); - } - pbe->orig_page = page; - if (safe_highmem_pages > 0) { - struct page *tmp; - - /* Copy of the page will be stored in high memory */ - kaddr = buffer; - tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); - safe_highmem_pages--; - last_highmem_page = tmp; - pbe->copy_page = tmp; - } else { - /* Copy of the page will be stored in normal memory */ - kaddr = safe_pages_list; - safe_pages_list = safe_pages_list->next; - pbe->copy_page = virt_to_page(kaddr); - } - pbe->next = highmem_pblist; - highmem_pblist = pbe; - return kaddr; -} - -/** - * copy_last_highmem_page - copy the contents of a highmem image from - * @buffer, where the caller of snapshot_write_next() has place them, - * to the right location represented by @last_highmem_page . - */ - -static void copy_last_highmem_page(void) -{ - if (last_highmem_page) { - void *dst; - - dst = kmap_atomic(last_highmem_page, KM_USER0); - copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); - last_highmem_page = NULL; - } -} - -static inline int last_highmem_page_copied(void) -{ - return !last_highmem_page; -} - -static inline void free_highmem_data(void) -{ - if (safe_highmem_bm) - memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); - - if (buffer) - free_image_page(buffer, PG_UNSAFE_CLEAR); -} -#else -static inline int get_safe_write_buffer(void) { return 0; } - -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } - -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - return 0; -} - -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) -{ - return ERR_PTR(-EINVAL); -} - -static inline void copy_last_highmem_page(void) {} -static inline int last_highmem_page_copied(void) { return 1; } -static inline void free_highmem_data(void) {} -#endif /* CONFIG_HIGHMEM */ - -/** - * 
prepare_image - use the memory bitmap @bm to mark the pages that will - * be overwritten in the process of restoring the system memory state - * from the suspend image ("unsafe" pages) and allocate memory for the - * image. - * - * The idea is to allocate a new memory bitmap first and then allocate - * as many pages as needed for the image data, but not to assign these - * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a lists of "safe" pages that will be used - * later. On systems with high memory a list of "safe" highmem pages is - * also created. - */ - -#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) -{ - unsigned int nr_pages, nr_highmem; - struct linked_page *sp_list, *lp; - int error; - - /* If there is no highmem, the buffer will not be necessary */ - free_image_page(buffer, PG_UNSAFE_CLEAR); - buffer = NULL; - - nr_highmem = count_highmem_image_pages(bm); - error = mark_unsafe_pages(bm); - if (error) - goto Free; - - error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); - if (error) - goto Free; - - duplicate_memory_bitmap(new_bm, bm); - memory_bm_free(bm, PG_UNSAFE_KEEP); - if (nr_highmem > 0) { - error = prepare_highmem_image(bm, &nr_highmem); - if (error) - goto Free; - } - /* Reserve some safe pages for potential later use. - * - * NOTE: This way we make sure there will be enough safe pages for the - * chain_alloc() in get_buffer(). It is a bit wasteful, but - * nr_copy_pages cannot be greater than 50% of the memory anyway. 
- */ - sp_list = NULL; - /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; - nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); - while (nr_pages > 0) { - lp = get_image_page(GFP_ATOMIC, PG_SAFE); - if (!lp) { - error = -ENOMEM; - goto Free; - } - lp->next = sp_list; - sp_list = lp; - nr_pages--; - } - /* Preallocate memory for the image */ - safe_pages_list = NULL; - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; - while (nr_pages > 0) { - lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); - if (!lp) { - error = -ENOMEM; - goto Free; - } - if (!swsusp_page_is_free(virt_to_page(lp))) { - /* The page is "safe", add it to the list */ - lp->next = safe_pages_list; - safe_pages_list = lp; - } - /* Mark the page as allocated */ - swsusp_set_page_forbidden(virt_to_page(lp)); - swsusp_set_page_free(virt_to_page(lp)); - nr_pages--; - } - /* Free the reserved safe pages so that chain_alloc() can use them */ - while (sp_list) { - lp = sp_list->next; - free_image_page(sp_list, PG_UNSAFE_CLEAR); - sp_list = lp; - } - return 0; - - Free: - swsusp_free(); - return error; -} - -/** - * get_buffer - compute the address that snapshot_write_next() should - * set for its caller to write to. - */ - -static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) -{ - struct pbe *pbe; - struct page *page; - unsigned long pfn = memory_bm_next_pfn(bm); - - if (pfn == BM_END_OF_MAP) - return ERR_PTR(-EFAULT); - - page = pfn_to_page(pfn); - if (PageHighMem(page)) - return get_highmem_page_buffer(page, ca); - - if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) - /* We have allocated the "original" page frame and we can - * use it directly to store the loaded page. - */ - return page_address(page); - - /* The "original" page frame has not been allocated and we have to - * use a "safe" page frame to store the loaded page. 
- */ - pbe = chain_alloc(ca, sizeof(struct pbe)); - if (!pbe) { - swsusp_free(); - return ERR_PTR(-ENOMEM); - } - pbe->orig_address = page_address(page); - pbe->address = safe_pages_list; - safe_pages_list = safe_pages_list->next; - pbe->next = restore_pblist; - restore_pblist = pbe; - return pbe->address; -} - -/** - * snapshot_write_next - used for writing the system memory snapshot. - * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. - * - * On success the function returns a positive number. Then, the caller - * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. - * - * The function returns 0 to indicate the "end of file" condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. - */ - -int snapshot_write_next(struct snapshot_handle *handle) -{ - static struct chain_allocator ca; - int error = 0; - - /* Check if we have already loaded the entire image */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) - return 0; - - handle->sync_read = 1; - - if (!handle->cur) { - if (!buffer) - /* This makes the buffer be freed by swsusp_free() */ - buffer = get_image_page(GFP_ATOMIC, PG_ANY); - - if (!buffer) - return -ENOMEM; - - handle->buffer = buffer; - } else if (handle->cur == 1) { - error = load_header(buffer); - if (error) - return error; - - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); - if (error) - return error; - - /* Allocate buffer for page keys. 
*/ - error = page_key_alloc(nr_copy_pages); - if (error) - return error; - - } else if (handle->cur <= nr_meta_pages + 1) { - error = unpack_orig_pfns(buffer, ©_bm); - if (error) - return error; - - if (handle->cur == nr_meta_pages + 1) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (IS_ERR(handle->buffer)) - return PTR_ERR(handle->buffer); - } - } else { - copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); - handle->buffer = get_buffer(&orig_bm, &ca); - if (IS_ERR(handle->buffer)) - return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; - } - handle->cur++; - return PAGE_SIZE; -} - -/** - * snapshot_write_finalize - must be called after the last call to - * snapshot_write_next() in case the last page in the image happens - * to be a highmem page and its contents should be stored in the - * highmem. Additionally, it releases the memory that will not be - * used any more. - */ - -void snapshot_write_finalize(struct snapshot_handle *handle) -{ - copy_last_highmem_page(); - /* Restore page key for data page (s390 only). 
*/ - page_key_write(handle->buffer); - page_key_free(); - /* Free only if we have loaded the image entirely */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); - free_highmem_data(); - } -} - -int snapshot_image_loaded(struct snapshot_handle *handle) -{ - return !(!nr_copy_pages || !last_highmem_page_copied() || - handle->cur <= nr_meta_pages + nr_copy_pages); -} - -#ifdef CONFIG_HIGHMEM -/* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) -{ - void *kaddr1, *kaddr2; - - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); - copy_page(buf, kaddr1); - copy_page(kaddr1, kaddr2); - copy_page(kaddr2, buf); - kunmap_atomic(kaddr2, KM_USER1); - kunmap_atomic(kaddr1, KM_USER0); -} - -/** - * restore_highmem - for each highmem page that was allocated before - * the suspend and included in the suspend image, and also has been - * allocated by the "resume" kernel swap its current (ie. "before - * resume") contents with the previous (ie. "before suspend") one. - * - * If the resume eventually fails, we can call this function once - * again and restore the "before resume" highmem state. - */ - -int restore_highmem(void) -{ - struct highmem_pbe *pbe = highmem_pblist; - void *buf; - - if (!pbe) - return 0; - - buf = get_image_page(GFP_ATOMIC, PG_SAFE); - if (!buf) - return -ENOMEM; - - while (pbe) { - swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); - pbe = pbe->next; - } - free_image_page(buf, PG_UNSAFE_CLEAR); - return 0; -} -#endif /* CONFIG_HIGHMEM */ -/* - * kernel/power/suspend.c - Suspend to RAM and standby functionality. - * - * Copyright (c) 2003 Patrick Mochel - * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - -const char *const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", -}; - -static const struct platform_suspend_ops *suspend_ops; - -/** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. - */ -void suspend_set_ops(const struct platform_suspend_ops *ops) -{ - lock_system_sleep(); - suspend_ops = ops; - unlock_system_sleep(); -} -EXPORT_SYMBOL_GPL(suspend_set_ops); - -bool valid_state(suspend_state_t state) -{ - /* - * All states need lowlevel support and need to be valid to the lowlevel - * implementation, no valid callback implies that none are valid. - */ - return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); -} - -/** - * suspend_valid_only_mem - generic memory-only valid callback - * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. - */ -int suspend_valid_only_mem(suspend_state_t state) -{ - return state == PM_SUSPEND_MEM; -} -EXPORT_SYMBOL_GPL(suspend_valid_only_mem); - -static int suspend_test(int level) -{ -#ifdef CONFIG_PM_DEBUG - if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); - return 1; - } -#endif /* !CONFIG_PM_DEBUG */ - return 0; -} - -/** - * suspend_prepare - Do prep work before entering low-power state. - * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. 
- */ -static int suspend_prepare(void) -{ - int error; - - if (!suspend_ops || !suspend_ops->enter) - return -EPERM; - - pm_prepare_console(); - - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) - goto Finish; - - error = usermodehelper_disable(); - if (error) - goto Finish; - - error = suspend_freeze_processes(); - if (!error) - return 0; - - suspend_stats.failed_freeze++; - dpm_save_failed_step(SUSPEND_FREEZE); - usermodehelper_enable(); - Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); - return error; -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) -{ - local_irq_disable(); -} - -/* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) -{ - local_irq_enable(); -} - -/** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. - * - * This function should be called after devices have been suspended. 
- */ -static int suspend_enter(suspend_state_t state, bool *wakeup) -{ - int error; - - if (suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Platform_finish; - } - - error = dpm_suspend_noirq(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platform_finish; - } - - if (suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Platform_wake; - } - - if (suspend_test(TEST_PLATFORM)) - goto Platform_wake; - - error = disable_nonboot_cpus(); - if (error || suspend_test(TEST_CPUS)) - goto Enable_cpus; - - arch_suspend_disable_irqs(); - BUG_ON(!irqs_disabled()); - - error = syscore_suspend(); - if (!error) { - *wakeup = pm_wakeup_pending(); - if (!(suspend_test(TEST_CORE) || *wakeup)) { - error = suspend_ops->enter(state); - events_check_enabled = false; - } - syscore_resume(); - } - - arch_suspend_enable_irqs(); - BUG_ON(irqs_disabled()); - - Enable_cpus: - enable_nonboot_cpus(); - - Platform_wake: - if (suspend_ops->wake) - suspend_ops->wake(); - - dpm_resume_noirq(PMSG_RESUME); - - Platform_finish: - if (suspend_ops->finish) - suspend_ops->finish(); - - return error; -} - -/** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. 
- * @state: state to enter - */ -int suspend_devices_and_enter(suspend_state_t state) -{ - int error; - bool wakeup = false; - - if (!suspend_ops) - return -ENOSYS; - - trace_machine_suspend(state); - if (suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } - suspend_console(); - suspend_test_start(); - error = dpm_suspend_start(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Recover_platform; - } - suspend_test_finish("suspend devices"); - if (suspend_test(TEST_DEVICES)) - goto Recover_platform; - - do { - error = suspend_enter(state, &wakeup); - } while (!error && !wakeup - && suspend_ops->suspend_again && suspend_ops->suspend_again()); - - Resume_devices: - suspend_test_start(); - dpm_resume_end(PMSG_RESUME); - suspend_test_finish("resume devices"); - resume_console(); - Close: - if (suspend_ops->end) - suspend_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); - return error; - - Recover_platform: - if (suspend_ops->recover) - suspend_ops->recover(); - goto Resume_devices; -} - -/** - * suspend_finish - Do final work before exiting suspend sequence. - * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. - */ -static void suspend_finish(void) -{ - suspend_thaw_processes(); - usermodehelper_enable(); - pm_notifier_call_chain(PM_POST_SUSPEND); - pm_restore_console(); -} - -/** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. - * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). 
- */ -int enter_state(suspend_state_t state) -{ - int error; - - if (!valid_state(state)) - return -ENODEV; - - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; - - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); - - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); - if (error) - goto Unlock; - - if (suspend_test(TEST_FREEZER)) - goto Finish; - - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - pm_restrict_gfp_mask(); - error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); - - Finish: - pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(); - Unlock: - mutex_unlock(&pm_mutex); - return error; -} - -/** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. - * - * Determine whether or not value is within range, get state - * structure, and enter (above). - */ -int pm_suspend(suspend_state_t state) -{ - int ret; - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { - ret = enter_state(state); - if (ret) { - suspend_stats.fail++; - dpm_save_failed_errno(ret); - } else - suspend_stats.success++; - return ret; - } - return -EINVAL; -} -EXPORT_SYMBOL(pm_suspend); -/* - * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. - * - * Copyright (c) 2009 Pavel Machek - * - * This file is released under the GPLv2. - */ - -#include -#include - -#include "power.h" - -/* - * We test the system suspend code by setting an RTC wakealarm a short - * time in the future, then suspending. Suspending the devices won't - * normally take long ... some systems only need a few milliseconds. - * - * The time it takes is system-specific though, so when we test this - * during system bootup we allow a LOT of time. - */ -#define TEST_SUSPEND_SECONDS 10 - -static unsigned long suspend_test_start_time; - -void suspend_test_start(void) -{ - /* FIXME Use better timebase than "jiffies", ideally a clocksource. 
- * What we want is a hardware counter that will work correctly even - * during the irqs-are-off stages of the suspend/resume cycle... - */ - suspend_test_start_time = jiffies; -} - -void suspend_test_finish(const char *label) -{ - long nj = jiffies - suspend_test_start_time; - unsigned msec; - - msec = jiffies_to_msecs(abs(nj)); - pr_info("PM: %s took %d.%03d seconds\n", label, - msec / 1000, msec % 1000); - - /* Warning on suspend means the RTC alarm period needs to be - * larger -- the system was sooo slooowwww to suspend that the - * alarm (should have) fired before the system went to sleep! - * - * Warning on either suspend or resume also means the system - * has some performance issues. The stack dump of a WARN_ON - * is more likely to get the right attention than a printk... - */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), - "Component: %s, time: %u\n", label, msec); -} - -/* - * To test system suspend, we need a hands-off mechanism to resume the - * system. RTCs wake alarms are a common self-contained mechanism. 
- */ - -static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) -{ - static char err_readtime[] __initdata = - KERN_ERR "PM: can't read %s time, err %d\n"; - static char err_wakealarm [] __initdata = - KERN_ERR "PM: can't set %s wakealarm, err %d\n"; - static char err_suspend[] __initdata = - KERN_ERR "PM: suspend test failed, error %d\n"; - static char info_test[] __initdata = - KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - - unsigned long now; - struct rtc_wkalrm alm; - int status; - - /* this may fail if the RTC hasn't been initialized */ - status = rtc_read_time(rtc, &alm.time); - if (status < 0) { - printk(err_readtime, dev_name(&rtc->dev), status); - return; - } - rtc_tm_to_time(&alm.time, &now); - - memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); - alm.enabled = true; - - status = rtc_set_alarm(rtc, &alm); - if (status < 0) { - printk(err_wakealarm, dev_name(&rtc->dev), status); - return; - } - - if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - if (status == -ENODEV) - state = PM_SUSPEND_STANDBY; - } - if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); - status = pm_suspend(state); - } - if (status < 0) - printk(err_suspend, status); - - /* Some platforms can't detect that the alarm triggered the - * wakeup, or (accordingly) disable it after it afterwards. - * It's supposed to give oneshot behavior; cope. - */ - alm.enabled = false; - rtc_set_alarm(rtc, &alm); -} - -static int __init has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(const char **)name_ptr = dev_name(dev); - return 1; -} - -/* - * Kernel options like "test_suspend=mem" force suspend/resume sanity tests - * at startup time. 
They're normally disabled, for faster boot and because - * we can't know which states really work on this particular system. - */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; - -static char warn_bad_state[] __initdata = - KERN_WARNING "PM: can't test '%s' suspend state\n"; - -static int __init setup_test_suspend(char *value) -{ - unsigned i; - - /* "=mem" ==> "mem" */ - value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } - printk(warn_bad_state, value); - return 0; -} -__setup("test_suspend", setup_test_suspend); - -static int __init test_suspend(void) -{ - static char warn_no_rtc[] __initdata = - KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - - char *pony = NULL; - struct rtc_device *rtc = NULL; - - /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); - goto done; - } - - /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); - if (!rtc) { - printk(warn_no_rtc); - goto done; - } - - /* go for it */ - test_wakealarm(rtc, test_state); - rtc_class_close(rtc); -done: - return 0; -} -late_initcall(test_suspend); -/* - * linux/kernel/power/swap.c - * - * This file provides functions for reading the suspend image from - * and writing it to a swap partition. - * - * Copyright (C) 1998,2001-2005 Pavel Machek - * Copyright (C) 2006 Rafael J. Wysocki - * Copyright (C) 2010 Bojan Smojver - * - * This file is released under the GPLv2. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "power.h" - -#define HIBERNATE_SIG "S1SUSPEND" - -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we pick up all swap_map_page structures into a list. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -struct swap_map_page_list { - struct swap_map_page *map; - struct swap_map_page_list *next; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - struct swap_map_page_list *maps; - sector_t cur_swap; - sector_t first_sector; - unsigned int k; - unsigned long nr_free_pages, written; - u32 crc32; -}; - -struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - - sizeof(u32)]; - u32 crc32; - sector_t image; - unsigned int flags; /* Flags to pass to the "boot" kernel */ - char orig_sig[10]; - char sig[10]; -} __attribute__((packed)); - -static struct swsusp_header *swsusp_header; - -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. 
- */ - -struct swsusp_extent { - struct rb_node node; - unsigned long start; - unsigned long end; -}; - -static struct rb_root swsusp_extents = RB_ROOT; - -static int swsusp_extents_insert(unsigned long swap_offset) -{ - struct rb_node **new = &(swsusp_extents.rb_node); - struct rb_node *parent = NULL; - struct swsusp_extent *ext; - - /* Figure out where to put the new node */ - while (*new) { - ext = container_of(*new, struct swsusp_extent, node); - parent = *new; - if (swap_offset < ext->start) { - /* Try to merge */ - if (swap_offset == ext->start - 1) { - ext->start--; - return 0; - } - new = &((*new)->rb_left); - } else if (swap_offset > ext->end) { - /* Try to merge */ - if (swap_offset == ext->end + 1) { - ext->end++; - return 0; - } - new = &((*new)->rb_right); - } else { - /* It already is in the tree */ - return -EINVAL; - } - } - /* Add the new node and rebalance the tree. */ - ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); - if (!ext) - return -ENOMEM; - - ext->start = swap_offset; - ext->end = swap_offset; - rb_link_node(&ext->node, parent, new); - rb_insert_color(&ext->node, &swsusp_extents); - return 0; -} - -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - -sector_t alloc_swapdev_block(int swap) -{ - unsigned long offset; - - offset = swp_offset(get_swap_page_of_type(swap)); - if (offset) { - if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); - else - return swapdev_block(swap, offset); - } - return 0; -} - -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. 
- */ - -void free_all_swap_pages(int swap) -{ - struct rb_node *node; - - while ((node = swsusp_extents.rb_node)) { - struct swsusp_extent *ext; - unsigned long offset; - - ext = container_of(node, struct swsusp_extent, node); - rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); - - kfree(ext); - } -} - -int swsusp_swap_in_use(void) -{ - return (swsusp_extents.rb_node != NULL); -} - -/* - * General things - */ - -static unsigned short root_swap = 0xffff; -struct block_device *hib_resume_bdev; - -/* - * Saving part - */ - -static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) -{ - int error; - - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); - if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { - memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); - memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); - swsusp_header->image = handle->first_sector; - swsusp_header->flags = flags; - if (flags & SF_CRC32_MODE) - swsusp_header->crc32 = handle->crc32; - error = hib_bio_write_page(swsusp_resume_block, - swsusp_header, NULL); - } else { - printk(KERN_ERR "PM: Swap header not found!\n"); - error = -ENODEV; - } - return error; -} - -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image - */ -static int swsusp_swap_check(void) -{ - int res; - - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &hib_resume_bdev); - if (res < 0) - return res; - - root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); - if (res) - return res; - - res = set_blocksize(hib_resume_bdev, PAGE_SIZE); - if (res < 0) - blkdev_put(hib_resume_bdev, FMODE_WRITE); - - return res; -} - -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. 
- * @offset: Offset of the swap page we're writing to. - * @bio_chain: Link the next write BIO here - */ - -static int write_page(void *buf, sector_t offset, struct bio **bio_chain) -{ - void *src; - int ret; - - if (!offset) - return -ENOSPC; - - if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (src) { - copy_page(src, buf); - } else { - ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ - if (ret) - return ret; - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (src) { - copy_page(src, buf); - } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; - } - } - } else { - src = buf; - } - return hib_bio_write_page(offset, src, bio_chain); -} - -static void release_swap_writer(struct swap_map_handle *handle) -{ - if (handle->cur) - free_page((unsigned long)handle->cur); - handle->cur = NULL; -} - -static int get_swap_writer(struct swap_map_handle *handle) -{ - int ret; - - ret = swsusp_swap_check(); - if (ret) { - if (ret != -ENOSPC) - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); - return ret; - } - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) { - ret = -ENOMEM; - goto err_close; - } - handle->cur_swap = alloc_swapdev_block(root_swap); - if (!handle->cur_swap) { - ret = -ENOSPC; - goto err_rel; - } - handle->k = 0; - handle->nr_free_pages = nr_free_pages() >> 1; - handle->written = 0; - handle->first_sector = handle->cur_swap; - return 0; -err_rel: - release_swap_writer(handle); -err_close: - swsusp_close(FMODE_WRITE); - return ret; -} - -static int swap_write_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) -{ - int error = 0; - sector_t offset; - - if (!handle->cur) - return -EINVAL; - offset = alloc_swapdev_block(root_swap); - error = write_page(buf, offset, bio_chain); - if (error) - return error; - handle->cur->entries[handle->k++] = offset; - if (handle->k >= MAP_PAGE_ENTRIES) { - offset = 
alloc_swapdev_block(root_swap); - if (!offset) - return -ENOSPC; - handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, bio_chain); - if (error) - goto out; - clear_page(handle->cur); - handle->cur_swap = offset; - handle->k = 0; - } - if (bio_chain && ++handle->written > handle->nr_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; - handle->written = 0; - } - out: - return error; -} - -static int flush_swap_writer(struct swap_map_handle *handle) -{ - if (handle->cur && handle->cur_swap) - return write_page(handle->cur, handle->cur_swap, NULL); - else - return -EINVAL; -} - -static int swap_writer_finish(struct swap_map_handle *handle, - unsigned int flags, int error) -{ - if (!error) { - flush_swap_writer(handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(handle, flags); - printk("|\n"); - } - - if (error) - free_all_swap_pages(root_swap); - release_swap_writer(handle); - swsusp_close(FMODE_WRITE); - - return error; -} - -/* We need to remember how much compressed data we need to read. */ -#define LZO_HEADER sizeof(size_t) - -/* Number of pages/bytes we'll compress at one time. */ -#define LZO_UNC_PAGES 32 -#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) - -/* Number of pages/bytes we need for compressed data (worst case). */ -#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ - LZO_HEADER, PAGE_SIZE) -#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) - -/* Maximum number of threads for compression/decompression. */ -#define LZO_THREADS 3 - -/* Maximum number of pages for read buffering. 
*/ -#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) - - -/** - * save_image - save the suspend image data - */ - -static int save_image(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_write) -{ - unsigned int m; - int ret; - int nr_pages; - int err2; - struct bio *bio; - struct timeval start; - struct timeval stop; - - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", - nr_to_write); - m = nr_to_write / 100; - if (!m) - m = 1; - nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); - while (1) { - ret = snapshot_read_next(snapshot); - if (ret <= 0) - break; - ret = swap_write_page(handle, data_of(*snapshot), &bio); - if (ret) - break; - if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); - if (!ret) - ret = err2; - if (!ret) - printk(KERN_CONT "\b\b\b\bdone\n"); - else - printk(KERN_CONT "\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return ret; -} - -/** - * Structure used for CRC32. - */ -struct crc_data { - struct task_struct *thr; /* thread */ - atomic_t ready; /* ready to start flag */ - atomic_t stop; /* ready to stop flag */ - unsigned run_threads; /* nr current threads */ - wait_queue_head_t go; /* start crc update */ - wait_queue_head_t done; /* crc update done */ - u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ - unsigned char *unc[LZO_THREADS]; /* uncompressed data */ -}; - -/** - * CRC32 update function that runs in its own thread. 
- */ -static int crc32_threadfn(void *data) -{ - struct crc_data *d = data; - unsigned i; - - while (1) { - wait_event(d->go, atomic_read(&d->ready) || - kthread_should_stop()); - if (kthread_should_stop()) { - d->thr = NULL; - atomic_set(&d->stop, 1); - wake_up(&d->done); - break; - } - atomic_set(&d->ready, 0); - - for (i = 0; i < d->run_threads; i++) - *d->crc32 = crc32_le(*d->crc32, - d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); - wake_up(&d->done); - } - return 0; -} -/** - * Structure used for LZO data compression. - */ -struct cmp_data { - struct task_struct *thr; /* thread */ - atomic_t ready; /* ready to start flag */ - atomic_t stop; /* ready to stop flag */ - int ret; /* return code */ - wait_queue_head_t go; /* start compression */ - wait_queue_head_t done; /* compression done */ - size_t unc_len; /* uncompressed length */ - size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ - unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ -}; - -/** - * Compression function that runs in its own thread. - */ -static int lzo_compress_threadfn(void *data) -{ - struct cmp_data *d = data; - - while (1) { - wait_event(d->go, atomic_read(&d->ready) || - kthread_should_stop()); - if (kthread_should_stop()) { - d->thr = NULL; - d->ret = -1; - atomic_set(&d->stop, 1); - wake_up(&d->done); - break; - } - atomic_set(&d->ready, 0); - - d->ret = lzo1x_1_compress(d->unc, d->unc_len, - d->cmp + LZO_HEADER, &d->cmp_len, - d->wrk); - atomic_set(&d->stop, 1); - wake_up(&d->done); - } - return 0; -} - -/** - * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap mam handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. 
- */ -static int save_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_write) -{ - unsigned int m; - int ret = 0; - int nr_pages; - int err2; - struct bio *bio; - struct timeval start; - struct timeval stop; - size_t off; - unsigned thr, run_threads, nr_threads; - unsigned char *page = NULL; - struct cmp_data *data = NULL; - struct crc_data *crc = NULL; - - /* - * We'll limit the number of threads for compression to limit memory - * footprint. - */ - nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - ret = -ENOMEM; - goto out_clean; - } - - data = vmalloc(sizeof(*data) * nr_threads); - if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); - ret = -ENOMEM; - goto out_clean; - } - for (thr = 0; thr < nr_threads; thr++) - memset(&data[thr], 0, offsetof(struct cmp_data, go)); - - crc = kmalloc(sizeof(*crc), GFP_KERNEL); - if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); - ret = -ENOMEM; - goto out_clean; - } - memset(crc, 0, offsetof(struct crc_data, go)); - - /* - * Start the compression threads. - */ - for (thr = 0; thr < nr_threads; thr++) { - init_waitqueue_head(&data[thr].go); - init_waitqueue_head(&data[thr].done); - - data[thr].thr = kthread_run(lzo_compress_threadfn, - &data[thr], - "image_compress/%u", thr); - if (IS_ERR(data[thr].thr)) { - data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start compression threads\n"); - ret = -ENOMEM; - goto out_clean; - } - } - - /* - * Adjust number of free pages after all allocations have been done. - * We don't want to run out of pages when writing. - */ - handle->nr_free_pages = nr_free_pages() >> 1; - - /* - * Start the CRC32 thread. 
- */ - init_waitqueue_head(&crc->go); - init_waitqueue_head(&crc->done); - - handle->crc32 = 0; - crc->crc32 = &handle->crc32; - for (thr = 0; thr < nr_threads; thr++) { - crc->unc[thr] = data[thr].unc; - crc->unc_len[thr] = &data[thr].unc_len; - } - - crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); - if (IS_ERR(crc->thr)) { - crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); - ret = -ENOMEM; - goto out_clean; - } - - printk(KERN_INFO - "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages) ... ", - nr_threads, nr_to_write); - m = nr_to_write / 100; - if (!m) - m = 1; - nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); - for (;;) { - for (thr = 0; thr < nr_threads; thr++) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { - ret = snapshot_read_next(snapshot); - if (ret < 0) - goto out_finish; - - if (!ret) - break; - - memcpy(data[thr].unc + off, - data_of(*snapshot), PAGE_SIZE); - - if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", - nr_pages / m); - nr_pages++; - } - if (!off) - break; - - data[thr].unc_len = off; - - atomic_set(&data[thr].ready, 1); - wake_up(&data[thr].go); - } - - if (!thr) - break; - - crc->run_threads = thr; - atomic_set(&crc->ready, 1); - wake_up(&crc->go); - - for (run_threads = thr, thr = 0; thr < run_threads; thr++) { - wait_event(data[thr].done, - atomic_read(&data[thr].stop)); - atomic_set(&data[thr].stop, 0); - - ret = data[thr].ret; - - if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); - goto out_finish; - } - - if (unlikely(!data[thr].cmp_len || - data[thr].cmp_len > - lzo1x_worst_compress(data[thr].unc_len))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); - ret = -1; - goto out_finish; - } - - *(size_t *)data[thr].cmp = data[thr].cmp_len; - - /* - * Given we are writing one page at a time to disk, we - * copy that much from the buffer, although the last - * bit will likely be smaller than full page. 
This is - * OK - we saved the length of the compressed data, so - * any garbage at the end will be discarded when we - * read it. - */ - for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; - off += PAGE_SIZE) { - memcpy(page, data[thr].cmp + off, PAGE_SIZE); - - ret = swap_write_page(handle, page, &bio); - if (ret) - goto out_finish; - } - } - - wait_event(crc->done, atomic_read(&crc->stop)); - atomic_set(&crc->stop, 0); - } - -out_finish: - err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); - if (!ret) - ret = err2; - if (!ret) { - printk(KERN_CONT "\b\b\b\bdone\n"); - } else { - printk(KERN_CONT "\n"); - } - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); -out_clean: - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } - if (data) { - for (thr = 0; thr < nr_threads; thr++) - if (data[thr].thr) - kthread_stop(data[thr].thr); - vfree(data); - } - if (page) free_page((unsigned long)page); - - return ret; -} - -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable from the resume partition. - */ - -static int enough_swap(unsigned int nr_pages, unsigned int flags) -{ - unsigned int free_swap = count_swap_pages(root_swap, 1); - unsigned int required; - - pr_debug("PM: Free swap pages: %u\n", free_swap); - - required = PAGES_FOR_IO + nr_pages; - return free_swap > required; -} - -/** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header - * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) 
- */ - -int swsusp_write(unsigned int flags) -{ - struct swap_map_handle handle; - struct snapshot_handle snapshot; - struct swsusp_info *header; - unsigned long pages; - int error; - - pages = snapshot_get_image_size(); - error = get_swap_writer(&handle); - if (error) { - printk(KERN_ERR "PM: Cannot get swap writer\n"); - return error; - } - if (flags & SF_NOCOMPRESS_MODE) { - if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out_finish; - } - } - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot); - if (error < PAGE_SIZE) { - if (error >= 0) - error = -EFAULT; - - goto out_finish; - } - header = (struct swsusp_info *)data_of(snapshot); - error = swap_write_page(&handle, header, NULL); - if (!error) { - error = (flags & SF_NOCOMPRESS_MODE) ? - save_image(&handle, &snapshot, pages - 1) : - save_image_lzo(&handle, &snapshot, pages - 1); - } -out_finish: - error = swap_writer_finish(&handle, flags, error); - return error; -} - -/** - * The following functions allow us to read data using a swap map - * in a file-alike way - */ - -static void release_swap_reader(struct swap_map_handle *handle) -{ - struct swap_map_page_list *tmp; - - while (handle->maps) { - if (handle->maps->map) - free_page((unsigned long)handle->maps->map); - tmp = handle->maps; - handle->maps = handle->maps->next; - kfree(tmp); - } - handle->cur = NULL; -} - -static int get_swap_reader(struct swap_map_handle *handle, - unsigned int *flags_p) -{ - int error; - struct swap_map_page_list *tmp, *last; - sector_t offset; - - *flags_p = swsusp_header->flags; - - if (!swsusp_header->image) /* how can this happen? 
*/ - return -EINVAL; - - handle->cur = NULL; - last = handle->maps = NULL; - offset = swsusp_header->image; - while (offset) { - tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); - if (!tmp) { - release_swap_reader(handle); - return -ENOMEM; - } - memset(tmp, 0, sizeof(*tmp)); - if (!handle->maps) - handle->maps = tmp; - if (last) - last->next = tmp; - last = tmp; - - tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!tmp->map) { - release_swap_reader(handle); - return -ENOMEM; - } - - error = hib_bio_read_page(offset, tmp->map, NULL); - if (error) { - release_swap_reader(handle); - return error; - } - offset = tmp->map->next_swap; - } - handle->k = 0; - handle->cur = handle->maps->map; - return 0; -} - -static int swap_read_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) -{ - sector_t offset; - int error; - struct swap_map_page_list *tmp; - - if (!handle->cur) - return -EINVAL; - offset = handle->cur->entries[handle->k]; - if (!offset) - return -EFAULT; - error = hib_bio_read_page(offset, buf, bio_chain); - if (error) - return error; - if (++handle->k >= MAP_PAGE_ENTRIES) { - handle->k = 0; - free_page((unsigned long)handle->maps->map); - tmp = handle->maps; - handle->maps = handle->maps->next; - kfree(tmp); - if (!handle->maps) - release_swap_reader(handle); - else - handle->cur = handle->maps->map; - } - return error; -} - -static int swap_reader_finish(struct swap_map_handle *handle) -{ - release_swap_reader(handle); - - return 0; -} - -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - -static int load_image(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_read) -{ - unsigned int m; - int ret = 0; - struct timeval start; - struct timeval stop; - struct bio *bio; - int err2; - unsigned nr_pages; - - printk(KERN_INFO "PM: Loading image data pages (%u 
pages) ... ", - nr_to_read); - m = nr_to_read / 100; - if (!m) - m = 1; - nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); - for ( ; ; ) { - ret = snapshot_write_next(snapshot); - if (ret <= 0) - break; - ret = swap_read_page(handle, data_of(*snapshot), &bio); - if (ret) - break; - if (snapshot->sync_read) - ret = hib_wait_on_bio_chain(&bio); - if (ret) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); - if (!ret) - ret = err2; - if (!ret) { - printk("\b\b\b\bdone\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) - ret = -ENODATA; - } else - printk("\n"); - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - return ret; -} - -/** - * Structure used for LZO data decompression. - */ -struct dec_data { - struct task_struct *thr; /* thread */ - atomic_t ready; /* ready to start flag */ - atomic_t stop; /* ready to stop flag */ - int ret; /* return code */ - wait_queue_head_t go; /* start decompression */ - wait_queue_head_t done; /* decompression done */ - size_t unc_len; /* uncompressed length */ - size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ -}; - -/** - * Deompression function that runs in its own thread. - */ -static int lzo_decompress_threadfn(void *data) -{ - struct dec_data *d = data; - - while (1) { - wait_event(d->go, atomic_read(&d->ready) || - kthread_should_stop()); - if (kthread_should_stop()) { - d->thr = NULL; - d->ret = -1; - atomic_set(&d->stop, 1); - wake_up(&d->done); - break; - } - atomic_set(&d->ready, 0); - - d->unc_len = LZO_UNC_SIZE; - d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, - d->unc, &d->unc_len); - atomic_set(&d->stop, 1); - wake_up(&d->done); - } - return 0; -} - -/** - * load_image_lzo - Load compressed image data and decompress them with LZO. 
- * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ -static int load_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_read) -{ - unsigned int m; - int ret = 0; - int eof = 0; - struct bio *bio; - struct timeval start; - struct timeval stop; - unsigned nr_pages; - size_t off; - unsigned i, thr, run_threads, nr_threads; - unsigned ring = 0, pg = 0, ring_size = 0, - have = 0, want, need, asked = 0; - unsigned long read_pages; - unsigned char **page = NULL; - struct dec_data *data = NULL; - struct crc_data *crc = NULL; - - /* - * We'll limit the number of threads for decompression to limit memory - * footprint. - */ - nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - - page = vmalloc(sizeof(*page) * LZO_READ_PAGES); - if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - ret = -ENOMEM; - goto out_clean; - } - - data = vmalloc(sizeof(*data) * nr_threads); - if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); - ret = -ENOMEM; - goto out_clean; - } - for (thr = 0; thr < nr_threads; thr++) - memset(&data[thr], 0, offsetof(struct dec_data, go)); - - crc = kmalloc(sizeof(*crc), GFP_KERNEL); - if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); - ret = -ENOMEM; - goto out_clean; - } - memset(crc, 0, offsetof(struct crc_data, go)); - - /* - * Start the decompression threads. - */ - for (thr = 0; thr < nr_threads; thr++) { - init_waitqueue_head(&data[thr].go); - init_waitqueue_head(&data[thr].done); - - data[thr].thr = kthread_run(lzo_decompress_threadfn, - &data[thr], - "image_decompress/%u", thr); - if (IS_ERR(data[thr].thr)) { - data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start decompression threads\n"); - ret = -ENOMEM; - goto out_clean; - } - } - - /* - * Start the CRC32 thread. 
- */ - init_waitqueue_head(&crc->go); - init_waitqueue_head(&crc->done); - - handle->crc32 = 0; - crc->crc32 = &handle->crc32; - for (thr = 0; thr < nr_threads; thr++) { - crc->unc[thr] = data[thr].unc; - crc->unc_len[thr] = &data[thr].unc_len; - } - - crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); - if (IS_ERR(crc->thr)) { - crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); - ret = -ENOMEM; - goto out_clean; - } - - /* - * Adjust number of pages for read buffering, in case we are short. - */ - read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; - read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); - - for (i = 0; i < read_pages; i++) { - page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT); - if (!page[i]) { - if (i < LZO_CMP_PAGES) { - ring_size = i; - printk(KERN_ERR - "PM: Failed to allocate LZO pages\n"); - ret = -ENOMEM; - goto out_clean; - } else { - break; - } - } - } - want = ring_size = i; - - printk(KERN_INFO - "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages) ... ", - nr_threads, nr_to_read); - m = nr_to_read / 100; - if (!m) - m = 1; - nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); - - ret = snapshot_write_next(snapshot); - if (ret <= 0) - goto out_finish; - - for(;;) { - for (i = 0; !eof && i < want; i++) { - ret = swap_read_page(handle, page[ring], &bio); - if (ret) { - /* - * On real read error, finish. On end of data, - * set EOF flag and just exit the read loop. - */ - if (handle->cur && - handle->cur->entries[handle->k]) { - goto out_finish; - } else { - eof = 1; - break; - } - } - if (++ring >= ring_size) - ring = 0; - } - asked += i; - want -= i; - - /* - * We are out of data, wait for some more. 
- */ - if (!have) { - if (!asked) - break; - - ret = hib_wait_on_bio_chain(&bio); - if (ret) - goto out_finish; - have += asked; - asked = 0; - if (eof) - eof = 2; - } - - if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); - atomic_set(&crc->stop, 0); - crc->run_threads = 0; - } - - for (thr = 0; have && thr < nr_threads; thr++) { - data[thr].cmp_len = *(size_t *)page[pg]; - if (unlikely(!data[thr].cmp_len || - data[thr].cmp_len > - lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); - ret = -1; - goto out_finish; - } - - need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, - PAGE_SIZE); - if (need > have) { - if (eof > 1) { - ret = -1; - goto out_finish; - } - break; - } - - for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; - off += PAGE_SIZE) { - memcpy(data[thr].cmp + off, - page[pg], PAGE_SIZE); - have--; - want++; - if (++pg >= ring_size) - pg = 0; - } - - atomic_set(&data[thr].ready, 1); - wake_up(&data[thr].go); - } - - /* - * Wait for more data while we are decompressing. 
- */ - if (have < LZO_CMP_PAGES && asked) { - ret = hib_wait_on_bio_chain(&bio); - if (ret) - goto out_finish; - have += asked; - asked = 0; - if (eof) - eof = 2; - } - - for (run_threads = thr, thr = 0; thr < run_threads; thr++) { - wait_event(data[thr].done, - atomic_read(&data[thr].stop)); - atomic_set(&data[thr].stop, 0); - - ret = data[thr].ret; - - if (ret < 0) { - printk(KERN_ERR - "PM: LZO decompression failed\n"); - goto out_finish; - } - - if (unlikely(!data[thr].unc_len || - data[thr].unc_len > LZO_UNC_SIZE || - data[thr].unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR - "PM: Invalid LZO uncompressed length\n"); - ret = -1; - goto out_finish; - } - - for (off = 0; - off < data[thr].unc_len; off += PAGE_SIZE) { - memcpy(data_of(*snapshot), - data[thr].unc + off, PAGE_SIZE); - - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - - ret = snapshot_write_next(snapshot); - if (ret <= 0) { - crc->run_threads = thr + 1; - atomic_set(&crc->ready, 1); - wake_up(&crc->go); - goto out_finish; - } - } - } - - crc->run_threads = thr; - atomic_set(&crc->ready, 1); - wake_up(&crc->go); - } - -out_finish: - if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); - atomic_set(&crc->stop, 0); - } - do_gettimeofday(&stop); - if (!ret) { - printk("\b\b\b\bdone\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) - ret = -ENODATA; - if (!ret) { - if (swsusp_header->flags & SF_CRC32_MODE) { - if(handle->crc32 != swsusp_header->crc32) { - printk(KERN_ERR - "PM: Invalid image CRC32!\n"); - ret = -ENODATA; - } - } - } - } else - printk("\n"); - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); -out_clean: - for (i = 0; i < ring_size; i++) - free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } - if (data) { - for (thr = 0; thr < nr_threads; thr++) - if (data[thr].thr) - kthread_stop(data[thr].thr); - vfree(data); - } - if (page) vfree(page); - - 
return ret; -} - -/** - * swsusp_read - read the hibernation image. - * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memory location - */ - -int swsusp_read(unsigned int *flags_p) -{ - int error; - struct swap_map_handle handle; - struct snapshot_handle snapshot; - struct swsusp_info *header; - - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot); - if (error < PAGE_SIZE) - return error < 0 ? error : -EFAULT; - header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, flags_p); - if (error) - goto end; - if (!error) - error = swap_read_page(&handle, header, NULL); - if (!error) { - error = (*flags_p & SF_NOCOMPRESS_MODE) ? - load_image(&handle, &snapshot, header->pages - 1) : - load_image_lzo(&handle, &snapshot, header->pages - 1); - } - swap_reader_finish(&handle); -end: - if (!error) - pr_debug("PM: Image successfully loaded\n"); - else - pr_debug("PM: Error %d resuming\n", error); - return error; -} - -/** - * swsusp_check - Check for swsusp signature in the resume device - */ - -int swsusp_check(void) -{ - int error; - - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); - if (!IS_ERR(hib_resume_bdev)) { - set_blocksize(hib_resume_bdev, PAGE_SIZE); - clear_page(swsusp_header); - error = hib_bio_read_page(swsusp_resume_block, - swsusp_header, NULL); - if (error) - goto put; - - if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { - memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); - /* Reset swap signature now */ - error = hib_bio_write_page(swsusp_resume_block, - swsusp_header, NULL); - } else { - error = -EINVAL; - } - -put: - if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); - else - pr_debug("PM: Image signature found, resuming\n"); - } else { - error = PTR_ERR(hib_resume_bdev); - } - - if (error) - pr_debug("PM: Image not found (code %d)\n", error); - - return error; -} - -/** - * swsusp_close - 
close swap device. - */ - -void swsusp_close(fmode_t mode) -{ - if (IS_ERR(hib_resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return; - } - - blkdev_put(hib_resume_bdev, mode); -} - -static int swsusp_header_init(void) -{ - swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); - if (!swsusp_header) - panic("Could not allocate memory for swsusp_header\n"); - return 0; -} - -core_initcall(swsusp_header_init); -/* - * linux/kernel/power/user.c - * - * This file provides the user space interface for software suspend/resume. - * - * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "power.h" - - -#define SNAPSHOT_MINOR 231 - -static struct snapshot_data { - struct snapshot_handle handle; - int swap; - int mode; - char frozen; - char ready; - char platform_support; -} snapshot_state; - -atomic_t snapshot_device_available = ATOMIC_INIT(1); - -static int snapshot_open(struct inode *inode, struct file *filp) -{ - struct snapshot_data *data; - int error; - - lock_system_sleep(); - - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { - error = -EBUSY; - goto Unlock; - } - - if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); - error = -ENOSYS; - goto Unlock; - } - if(create_basic_memory_bitmaps()) { - atomic_inc(&snapshot_device_available); - error = -ENOMEM; - goto Unlock; - } - nonseekable_open(inode, filp); - data = &snapshot_state; - filp->private_data = data; - memset(&data->handle, 0, sizeof(struct snapshot_handle)); - if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { - /* Hibernating. The image device should be accessible. */ - data->swap = swsusp_resume_device ? 
- swap_type_of(swsusp_resume_device, 0, NULL) : -1; - data->mode = O_RDONLY; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); - } else { - /* - * Resuming. We may need to wait for the image device to - * appear. - */ - wait_for_device_probe(); - scsi_complete_async_scans(); - - data->swap = -1; - data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - pm_notifier_call_chain(PM_POST_RESTORE); - } - if (error) { - free_basic_memory_bitmaps(); - atomic_inc(&snapshot_device_available); - } - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; - - Unlock: - unlock_system_sleep(); - - return error; -} - -static int snapshot_release(struct inode *inode, struct file *filp) -{ - struct snapshot_data *data; - - lock_system_sleep(); - - swsusp_free(); - free_basic_memory_bitmaps(); - data = filp->private_data; - free_all_swap_pages(data->swap); - if (data->frozen) { - pm_restore_gfp_mask(); - thaw_processes(); - } - pm_notifier_call_chain(data->mode == O_RDONLY ? - PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); - - unlock_system_sleep(); - - return 0; -} - -static ssize_t snapshot_read(struct file *filp, char __user *buf, - size_t count, loff_t *offp) -{ - struct snapshot_data *data; - ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - - lock_system_sleep(); - - data = filp->private_data; - if (!data->ready) { - res = -ENODATA; - goto Unlock; - } - if (!pg_offp) { /* on page boundary? 
*/ - res = snapshot_read_next(&data->handle); - if (res <= 0) - goto Unlock; - } else { - res = PAGE_SIZE - pg_offp; - } - - res = simple_read_from_buffer(buf, count, &pg_offp, - data_of(data->handle), res); - if (res > 0) - *offp += res; - - Unlock: - unlock_system_sleep(); - - return res; -} - -static ssize_t snapshot_write(struct file *filp, const char __user *buf, - size_t count, loff_t *offp) -{ - struct snapshot_data *data; - ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - - lock_system_sleep(); - - data = filp->private_data; - - if (!pg_offp) { - res = snapshot_write_next(&data->handle); - if (res <= 0) - goto unlock; - } else { - res = PAGE_SIZE - pg_offp; - } - - res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, - buf, count); - if (res > 0) - *offp += res; -unlock: - unlock_system_sleep(); - - return res; -} - -static long snapshot_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - int error = 0; - struct snapshot_data *data; - loff_t size; - sector_t offset; - - if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) - return -ENOTTY; - if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) - return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!mutex_trylock(&pm_mutex)) - return -EBUSY; - - data = filp->private_data; - - switch (cmd) { - - case SNAPSHOT_FREEZE: - if (data->frozen) - break; - - printk("Syncing filesystems ... 
"); - sys_sync(); - printk("done.\n"); - - error = usermodehelper_disable(); - if (error) - break; - - error = freeze_processes(); - if (error) - usermodehelper_enable(); - else - data->frozen = 1; - break; - - case SNAPSHOT_UNFREEZE: - if (!data->frozen || data->ready) - break; - pm_restore_gfp_mask(); - thaw_processes(); - usermodehelper_enable(); - data->frozen = 0; - break; - - case SNAPSHOT_CREATE_IMAGE: - if (data->mode != O_RDONLY || !data->frozen || data->ready) { - error = -EPERM; - break; - } - pm_restore_gfp_mask(); - error = hibernation_snapshot(data->platform_support); - if (error) { - thaw_kernel_threads(); - } else { - error = put_user(in_suspend, (int __user *)arg); - if (!error && !freezer_test_done) - data->ready = 1; - if (freezer_test_done) { - freezer_test_done = false; - thaw_kernel_threads(); - } - } - break; - - case SNAPSHOT_ATOMIC_RESTORE: - snapshot_write_finalize(&data->handle); - if (data->mode != O_WRONLY || !data->frozen || - !snapshot_image_loaded(&data->handle)) { - error = -EPERM; - break; - } - error = hibernation_restore(data->platform_support); - break; - - case SNAPSHOT_FREE: - swsusp_free(); - memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; - /* - * It is necessary to thaw kernel threads here, because - * SNAPSHOT_CREATE_IMAGE may be invoked directly after - * SNAPSHOT_FREE. In that case, if kernel threads were not - * thawed, the preallocation of memory carried out by - * hibernation_snapshot() might run into problems (i.e. it - * might fail or even deadlock). 
- */ - thaw_kernel_threads(); - break; - - case SNAPSHOT_PREF_IMAGE_SIZE: - image_size = arg; - break; - - case SNAPSHOT_GET_IMAGE_SIZE: - if (!data->ready) { - error = -ENODATA; - break; - } - size = snapshot_get_image_size(); - size <<= PAGE_SHIFT; - error = put_user(size, (loff_t __user *)arg); - break; - - case SNAPSHOT_AVAIL_SWAP_SIZE: - size = count_swap_pages(data->swap, 1); - size <<= PAGE_SHIFT; - error = put_user(size, (loff_t __user *)arg); - break; - - case SNAPSHOT_ALLOC_SWAP_PAGE: - if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { - error = -ENODEV; - break; - } - offset = alloc_swapdev_block(data->swap); - if (offset) { - offset <<= PAGE_SHIFT; - error = put_user(offset, (loff_t __user *)arg); - } else { - error = -ENOSPC; - } - break; - - case SNAPSHOT_FREE_SWAP_PAGES: - if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { - error = -ENODEV; - break; - } - free_all_swap_pages(data->swap); - break; - - case SNAPSHOT_S2RAM: - if (!data->frozen) { - error = -EPERM; - break; - } - /* - * Tasks are frozen and the notifiers have been called with - * PM_HIBERNATION_PREPARE - */ - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - data->ready = 0; - break; - - case SNAPSHOT_PLATFORM_SUPPORT: - data->platform_support = !!arg; - break; - - case SNAPSHOT_POWER_OFF: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case SNAPSHOT_SET_SWAP_AREA: - if (swsusp_swap_in_use()) { - error = -EPERM; - } else { - struct resume_swap_area swap_area; - dev_t swdev; - - error = copy_from_user(&swap_area, (void __user *)arg, - sizeof(struct resume_swap_area)); - if (error) { - error = -EFAULT; - break; - } - - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - swdev = new_decode_dev(swap_area.dev); - if (swdev) { - offset = swap_area.offset; - data->swap = swap_type_of(swdev, offset, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } - 
break; - - default: - error = -ENOTTY; - - } - - mutex_unlock(&pm_mutex); - - return error; -} - -#ifdef CONFIG_COMPAT - -struct compat_resume_swap_area { - compat_loff_t offset; - u32 dev; -} __packed; - -static long -snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); - - switch (cmd) { - case SNAPSHOT_GET_IMAGE_SIZE: - case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_ALLOC_SWAP_PAGE: { - compat_loff_t __user *uoffset = compat_ptr(arg); - loff_t offset; - mm_segment_t old_fs; - int err; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, cmd, (unsigned long) &offset); - set_fs(old_fs); - if (!err && put_user(offset, uoffset)) - err = -EFAULT; - return err; - } - - case SNAPSHOT_CREATE_IMAGE: - return snapshot_ioctl(file, cmd, - (unsigned long) compat_ptr(arg)); - - case SNAPSHOT_SET_SWAP_AREA: { - struct compat_resume_swap_area __user *u_swap_area = - compat_ptr(arg); - struct resume_swap_area swap_area; - mm_segment_t old_fs; - int err; - - err = get_user(swap_area.offset, &u_swap_area->offset); - err |= get_user(swap_area.dev, &u_swap_area->dev); - if (err) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, - (unsigned long) &swap_area); - set_fs(old_fs); - return err; - } - - default: - return snapshot_ioctl(file, cmd, arg); - } -} - -#endif /* CONFIG_COMPAT */ - -static const struct file_operations snapshot_fops = { - .open = snapshot_open, - .release = snapshot_release, - .read = snapshot_read, - .write = snapshot_write, - .llseek = no_llseek, - .unlocked_ioctl = snapshot_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = snapshot_compat_ioctl, -#endif -}; - -static struct miscdevice snapshot_device = { - .minor = SNAPSHOT_MINOR, - .name = "snapshot", - .fops = &snapshot_fops, -}; - -static int __init snapshot_device_init(void) -{ - return misc_register(&snapshot_device); -}; - 
-device_initcall(snapshot_device_init); -/* - * linux/kernel/printk.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Modified to make sys_syslog() more flexible: added commands to - * return the last 4k of kernel messages, regardless of whether - * they've been read or not. Added option to suppress kernel printk's - * to the console. Added hook for sending the console messages - * elsewhere, in preparation for a serial line console (someday). - * Ted Ts'o, 2/11/93. - * Modified for sysctl support, 1/8/97, Chris Horn. - * Fixed SMP synchronization, 08/08/99, Manfred Spraul - * manfred@colorfullife.com - * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For in_interrupt() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - -#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) - -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. */ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - -DECLARE_WAIT_QUEUE_HEAD(log_wait); - -int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ -}; - -/* - * Low level drivers may need that to know if they can schedule in - * their unblank() callback or not. So let's export it. 
- */ -int oops_in_progress; -EXPORT_SYMBOL(oops_in_progress); - -/* - * console_sem protects the console_drivers list, and also - * provides serialisation for access to the entire console - * driver system. - */ -static DEFINE_SEMAPHORE(console_sem); -struct console *console_drivers; -EXPORT_SYMBOL_GPL(console_drivers); - -/* - * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. It's - * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held - */ -static int console_locked, console_suspended; - -/* - * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars - * It is also used in interesting ways to provide interlocking in - * console_unlock();. - */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); - -#define LOG_BUF_MASK (log_buf_len-1) -#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) - -/* - * The indices into log_buf are not constrained to log_buf_len - they - * must be masked before subscripting - */ -static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ -static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ -static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ - -/* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - -/* - * Array of consoles built from command line options (console=) - */ -struct console_cmdline -{ - char name[8]; /* Name of the driver */ - int index; /* Minor dev. 
to use */ - char *options; /* Options for the driver */ -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - char *brl_options; /* Options for braille driver */ -#endif -}; - -#define MAX_CMDLINECONSOLES 8 - -static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int selected_console = -1; -static int preferred_console = -1; -int console_set_on_cmdline; -EXPORT_SYMBOL(console_set_on_cmdline); - -/* Flag: console code may call schedule() */ -static int console_may_schedule; - -#ifdef CONFIG_PRINTK - -static char __log_buf[__LOG_BUF_LEN]; -static char *log_buf = __log_buf; -static int log_buf_len = __LOG_BUF_LEN; -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ -static int saved_console_loglevel = -1; - -#ifdef CONFIG_KEXEC -/* - * This appends the listed symbols to /proc/vmcoreinfo - * - * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to - * obtain access to symbols that are otherwise very difficult to locate. These - * symbols are specifically used so that utilities can access and extract the - * dmesg log from a vmcore file after a crash. 
- */ -void log_buf_kexec_setup(void) -{ - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_end); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(logged_chars); -} -#endif - -/* requested log_buf_len from kernel cmdline */ -static unsigned long __initdata new_log_buf_len; - -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) -{ - unsigned size = memparse(str, &str); - - if (size) - size = roundup_pow_of_two(size); - if (size > log_buf_len) - new_log_buf_len = size; - - return 0; -} -early_param("log_buf_len", log_buf_len_setup); - -void __init setup_log_buf(int early) -{ - unsigned long flags; - unsigned start, dest_idx, offset; - char *new_log_buf; - int free; - - if (!new_log_buf_len) - return; - - if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); - } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); - } - - if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %ld bytes not available\n", - new_log_buf_len); - return; - } - - raw_spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = new_log_buf_len; - log_buf = new_log_buf; - new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_end; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); - - log_buf[dest_idx] = __log_buf[log_idx_mask]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - pr_info("log_buf_len: %d\n", log_buf_len); - pr_info("early log buf free: %d(%d%%)\n", - free, (free * 100) / __LOG_BUF_LEN); -} - -#ifdef CONFIG_BOOT_PRINTK_DELAY - -static int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long loops_per_msec; /* based on boot_delay */ - -static int __init boot_delay_setup(char *str) -{ - unsigned long 
lpj; - - lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ - loops_per_msec = (unsigned long long)lpj / 1000 * HZ; - - get_option(&str, &boot_delay); - if (boot_delay > 10 * 1000) - boot_delay = 0; - - pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, loops_per_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, loops_per_msec); - return 1; -} -__setup("boot_delay=", boot_delay_setup); - -static void boot_delay_msec(void) -{ - unsigned long long k; - unsigned long timeout; - - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) - return; - - k = (unsigned long long)loops_per_msec * boot_delay; - - timeout = jiffies + msecs_to_jiffies(boot_delay); - while (k) { - k--; - cpu_relax(); - /* - * use (volatile) jiffies to prevent - * compiler reduction; loop termination via jiffies - * is secondary and may or may not happen. - */ - if (time_after(jiffies, timeout)) - break; - touch_nmi_watchdog(); - } -} -#else -static inline void boot_delay_msec(void) -{ -} -#endif - -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. 
- */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - -int do_syslog(int type, char __user *buf, int len, bool from_file) -{ - unsigned i, j, limit, count; - int do_clear = 0; - char c; - int error; - - error = check_syslog_permissions(type, from_file); - if (error) - goto out; - - error = security_syslog(type); - if (error) - return error; - - switch (type) { - case SYSLOG_ACTION_CLOSE: /* Close log */ - break; - case SYSLOG_ACTION_OPEN: /* Open log */ - break; - case SYSLOG_ACTION_READ: /* Read from log */ - error = -EINVAL; - if (!buf || len < 0) - goto out; - error = 0; - if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } - error = wait_event_interruptible(log_wait, - (log_start - log_end)); - if (error) - goto out; - i = 0; - raw_spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; - raw_spin_unlock_irq(&logbuf_lock); - error = __put_user(c,buf); - buf++; - i++; - cond_resched(); - raw_spin_lock_irq(&logbuf_lock); - } - raw_spin_unlock_irq(&logbuf_lock); - if (!error) - error = i; - break; - /* Read/clear last kernel messages */ - case SYSLOG_ACTION_READ_CLEAR: - do_clear = 1; - /* FALL THRU */ - /* Read last kernel messages */ - case SYSLOG_ACTION_READ_ALL: - error = -EINVAL; - if (!buf || len < 0) - goto out; - error = 0; - if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } - count = len; - if (count > log_buf_len) - count = log_buf_len; - raw_spin_lock_irq(&logbuf_lock); - if 
(count > logged_chars) - count = logged_chars; - if (do_clear) - logged_chars = 0; - limit = log_end; - /* - * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages - * we try to copy to user space. Therefore - * the messages are copied in reverse. - */ - for (i = 0; i < count && !error; i++) { - j = limit-1-i; - if (j + log_buf_len < log_end) - break; - c = LOG_BUF(j); - raw_spin_unlock_irq(&logbuf_lock); - error = __put_user(c,&buf[count-1-i]); - cond_resched(); - raw_spin_lock_irq(&logbuf_lock); - } - raw_spin_unlock_irq(&logbuf_lock); - if (error) - break; - error = i; - if (i != count) { - int offset = count-error; - /* buffer overflow during copy, correct user buffer. */ - for (i = 0; i < error; i++) { - if (__get_user(c,&buf[i+offset]) || - __put_user(c,&buf[i])) { - error = -EFAULT; - break; - } - cond_resched(); - } - } - break; - /* Clear ring buffer */ - case SYSLOG_ACTION_CLEAR: - logged_chars = 0; - break; - /* Disable logging to console */ - case SYSLOG_ACTION_CONSOLE_OFF: - if (saved_console_loglevel == -1) - saved_console_loglevel = console_loglevel; - console_loglevel = minimum_console_loglevel; - break; - /* Enable logging to console */ - case SYSLOG_ACTION_CONSOLE_ON: - if (saved_console_loglevel != -1) { - console_loglevel = saved_console_loglevel; - saved_console_loglevel = -1; - } - break; - /* Set level of messages printed to console */ - case SYSLOG_ACTION_CONSOLE_LEVEL: - error = -EINVAL; - if (len < 1 || len > 8) - goto out; - if (len < minimum_console_loglevel) - len = minimum_console_loglevel; - console_loglevel = len; - /* Implicitly re-enable logging to console */ - saved_console_loglevel = -1; - error = 0; - break; - /* Number of chars in the log buffer */ - case SYSLOG_ACTION_SIZE_UNREAD: - error = log_end - log_start; - break; - /* Size of the log buffer */ - case SYSLOG_ACTION_SIZE_BUFFER: - error = log_buf_len; - break; - default: - error = -EINVAL; - break; - } -out: - return error; -} - 
-SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) -{ - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); -} - -#ifdef CONFIG_KGDB_KDB -/* kdb dmesg command needs access to the syslog buffer. do_syslog() - * uses locks so it cannot be used during debugging. Just tell kdb - * where the start and end of the physical and logical logs are. This - * is equivalent to do_syslog(3). - */ -void kdb_syslog_data(char *syslog_data[4]) -{ - syslog_data[0] = log_buf; - syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_end - - (logged_chars < log_buf_len ? logged_chars : log_buf_len); - syslog_data[3] = log_buf + log_end; -} -#endif /* CONFIG_KGDB_KDB */ - -/* - * Call the console drivers on a range of log_buf - */ -static void __call_console_drivers(unsigned start, unsigned end) -{ - struct console *con; - - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) - con->write(con, &LOG_BUF(start), end - start); - } -} - -static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - -/* - * Write out chars from start to end - 1 inclusive - */ -static void _call_console_drivers(unsigned start, - unsigned end, int msg_log_level) -{ - if ((msg_log_level < console_loglevel || ignore_loglevel) && - console_drivers && start != end) { - if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { - /* wrapped write */ - __call_console_drivers(start & LOG_BUF_MASK, - log_buf_len); - __call_console_drivers(0, end & LOG_BUF_MASK); - } else { - 
__call_console_drivers(start, end); - } - } -} - -/* - * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the - * lower 3 bit are the log level, the rest are the log facility. In case - * userspace passes usual userspace syslog messages to /dev/kmsg or - * /dev/ttyprintk, the log prefix might contain the facility. Printk needs - * to extract the correct log level for in-kernel processing, and not mangle - * the original value. - * - * If a prefix is found, the length of the prefix is returned. If 'level' is - * passed, it will be filled in with the log level without a possible facility - * value. If 'special' is passed, the special printk prefix chars are accepted - * and returned. If no valid header is found, 0 is returned and the passed - * variables are not touched. - */ -static size_t log_prefix(const char *p, unsigned int *level, char *special) -{ - unsigned int lev = 0; - char sp = '\0'; - size_t len; - - if (p[0] != '<' || !p[1]) - return 0; - if (p[2] == '>') { - /* usual single digit level number or special char */ - switch (p[1]) { - case '0' ... '7': - lev = p[1] - '0'; - break; - case 'c': /* KERN_CONT */ - case 'd': /* KERN_DEFAULT */ - sp = p[1]; - break; - default: - return 0; - } - len = 3; - } else { - /* multi digit including the level and facility number */ - char *endp = NULL; - - lev = (simple_strtoul(&p[1], &endp, 10) & 7); - if (endp == NULL || endp[0] != '>') - return 0; - len = (endp + 1) - p; - } - - /* do not accept special char if not asked for */ - if (sp && !special) - return 0; - - if (special) { - *special = sp; - /* return special char, do not touch level */ - if (sp) - return len; - } - - if (level) - *level = lev; - return len; -} - -/* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. 
- */ -static void call_console_drivers(unsigned start, unsigned end) -{ - unsigned cur_index, start_print; - static int msg_level = -1; - - BUG_ON(((int)(start - end)) > 0); - - cur_index = start; - start_print = start; - while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2)) { - /* strip log prefix */ - cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); - start_print = cur_index; - } - while (cur_index != end) { - char c = LOG_BUF(cur_index); - - cur_index++; - if (c == '\n') { - if (msg_level < 0) { - /* - * printk() has already given us loglevel tags in - * the buffer. This code is here in case the - * log buffer has wrapped right round and scribbled - * on those tags - */ - msg_level = default_message_loglevel; - } - _call_console_drivers(start_print, cur_index, msg_level); - msg_level = -1; - start_print = cur_index; - break; - } - } - } - _call_console_drivers(start_print, end, msg_level); -} - -static void emit_log_char(char c) -{ - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; -} - -/* - * Zap console related locks when oopsing. Only zap at most once - * every 10 seconds, to leave time for slow consoles to print a - * full oops. 
- */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - debug_locks_off(); - /* If a crash is occurring, make sure we can't deadlock */ - raw_spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - sema_init(&console_sem, 1); -} - -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time = 0; -#endif -module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); - -static bool always_kmsg_dump; -module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); - -/* Check if we have any console registered that can be called early in boot. */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/** - * printk - print a kernel message - * @fmt: format string - * - * This is printk(). It can be called from any context. We want it to work. - * - * We try to grab the console_lock. If we succeed, it's easy - we log the output and - * call the console drivers. If we fail to get the semaphore we place the output - * into the log buffer and return. The current holder of the console_sem will - * notice the new output in console_unlock(); and will send it to the - * consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ - -asmlinkage int printk(const char *fmt, ...) 
-{ - va_list args; - int r; - -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif - va_start(args, fmt); - r = vprintk(fmt, args); - va_end(args); - - return r; -} - -/* cpu currently holding logbuf_lock */ -static volatile unsigned int printk_cpu = UINT_MAX; - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. - */ -static int console_trylock_for_printk(unsigned int cpu) - __releases(&logbuf_lock) -{ - int retval = 0, wake = 0; - - if (console_trylock()) { - retval = 1; - - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. 
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } - } - printk_cpu = UINT_MAX; - if (wake) - up(&console_sem); - raw_spin_unlock(&logbuf_lock); - return retval; -} -static const char recursion_bug_msg [] = - KERN_CRIT "BUG: recent printk recursion!\n"; -static int recursion_bug; -static int new_text_line = 1; -static char printk_buf[1024]; - -int printk_delay_msec __read_mostly; - -static inline void printk_delay(void) -{ - if (unlikely(printk_delay_msec)) { - int m = printk_delay_msec; - - while (m--) { - mdelay(1); - touch_nmi_watchdog(); - } - } -} - -asmlinkage int vprintk(const char *fmt, va_list args) -{ - int printed_len = 0; - int current_log_level = default_message_loglevel; - unsigned long flags; - int this_cpu; - char *p; - size_t plen; - char special; - - boot_delay_msec(); - printk_delay(); - - /* This stops the holder of console_sem just where we want him */ - local_irq_save(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! - */ - if (unlikely(printk_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. 
Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; - goto out_restore_irqs; - } - zap_locks(); - } - - lockdep_off(); - raw_spin_lock(&logbuf_lock); - printk_cpu = this_cpu; - - if (recursion_bug) { - recursion_bug = 0; - strcpy(printk_buf, recursion_bug_msg); - printed_len = strlen(recursion_bug_msg); - } - /* Emit the output into the temporary buffer */ - printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf) - printed_len, fmt, args); - - p = printk_buf; - - /* Read log level and handle special printk prefix */ - plen = log_prefix(p, ¤t_log_level, &special); - if (plen) { - p += plen; - - switch (special) { - case 'c': /* Strip KERN_CONT, continue line */ - plen = 0; - break; - case 'd': /* Strip KERN_DEFAULT, start new line */ - plen = 0; - default: - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } - } - } - - /* - * Copy the output into log_buf. 
If the caller didn't provide - * the appropriate log prefix, we insert them here - */ - for (; *p; p++) { - if (new_text_line) { - new_text_line = 0; - - if (plen) { - /* Copy original log prefix */ - int i; - - for (i = 0; i < plen; i++) - emit_log_char(printk_buf[i]); - printed_len += plen; - } else { - /* Add log prefix */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; - } - - if (printk_time) { - /* Add the current time stamp */ - char tbuf[50], *tp; - unsigned tlen; - unsigned long long t; - unsigned long nanosec_rem; - - t = cpu_clock(printk_cpu); - nanosec_rem = do_div(t, 1000000000); - tlen = sprintf(tbuf, "[%5lu.%06lu] ", - (unsigned long) t, - nanosec_rem / 1000); - - for (tp = tbuf; tp < tbuf + tlen; tp++) - emit_log_char(*tp); - printed_len += tlen; - } - - if (!*p) - break; - } - - emit_log_char(*p); - if (*p == '\n') - new_text_line = 1; - } - - /* - * Try to acquire and then immediately release the - * console semaphore. The release will do all the - * actual magic (print out buffers, wake up klogd, - * etc). - * - * The console_trylock_for_printk() function - * will release 'logbuf_lock' regardless of whether it - * actually gets the semaphore or not. - */ - if (console_trylock_for_printk(this_cpu)) - console_unlock(); - - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); - - return printed_len; -} -EXPORT_SYMBOL(printk); -EXPORT_SYMBOL(vprintk); - -#else - -static void call_console_drivers(unsigned start, unsigned end) -{ -} - -#endif - -static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. 
- */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - if (!brl_options) - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - if (!brl_options) - selected_console = i; - c = &console_cmdline[i]; - strlcpy(c->name, name, sizeof(c->name)); - c->options = options; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - c->brl_options = brl_options; -#endif - c->index = idx; - return 0; -} -/* - * Set up a list of consoles. Called from init/main.c - */ -static int __init console_setup(char *str) -{ - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ - char *s, *options, *brl_options = NULL; - int idx; - -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (!memcmp(str, "brl,", 4)) { - brl_options = ""; - str += 4; - } else if (!memcmp(str, "brl=", 4)) { - brl_options = str + 4; - str = strchr(brl_options, ','); - if (!str) { - printk(KERN_ERR "need port name after brl=\n"); - return 1; - } - *(str++) = 0; - } -#endif - - /* - * Decode str into name, index, options. - */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(buf, "ttyS"); - strncpy(buf + 4, str, sizeof(buf) - 5); - } else { - strncpy(buf, str, sizeof(buf) - 1); - } - buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) - *(options++) = 0; -#ifdef __sparc__ - if (!strcmp(str, "ttya")) - strcpy(buf, "ttyS0"); - if (!strcmp(str, "ttyb")) - strcpy(buf, "ttyS1"); -#endif - for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') - break; - idx = simple_strtoul(s, NULL, 10); - *s = 0; - - __add_preferred_console(buf, idx, options, brl_options); - console_set_on_cmdline = 1; - return 1; -} -__setup("console=", console_setup); - -/** - * add_preferred_console - add a device to the list of preferred consoles. 
- * @name: device name - * @idx: device index - * @options: options for this console - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. - */ -int add_preferred_console(char *name, int idx, char *options) -{ - return __add_preferred_console(name, idx, options, NULL); -} - -int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) -{ - struct console_cmdline *c; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - c = &console_cmdline[i]; - strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx_new; - return i; - } - /* not found */ - return -1; -} - -bool console_suspend_enabled = 1; -EXPORT_SYMBOL(console_suspend_enabled); - -static int __init console_suspend_disable(char *str) -{ - console_suspend_enabled = 0; - return 1; -} -__setup("no_console_suspend", console_suspend_disable); -module_param_named(console_suspend, console_suspend_enabled, - bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(console_suspend, "suspend console during suspend" - " and hibernate operations"); - -/** - * suspend_console - suspend the console subsystem - * - * This disables printk() while we go into suspend states - */ -void suspend_console(void) -{ - if (!console_suspend_enabled) - return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); - console_lock(); - console_suspended = 1; - up(&console_sem); -} - -void resume_console(void) -{ - if (!console_suspend_enabled) - return; - down(&console_sem); - 
console_suspended = 0; - console_unlock(); -} - -/** - * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused - * - * If printk() is called from a CPU that is not online yet, the messages - * will be spooled but will not show up on the console. This function is - * called when a new CPU comes online (or fails to come up), and ensures - * that any such output gets printed. - */ -static int __cpuinit console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DYING: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: - console_lock(); - console_unlock(); - } - return NOTIFY_OK; -} - -/** - * console_lock - lock the console system for exclusive use. - * - * Acquires a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. - * - * Can sleep, returns nothing. - */ -void console_lock(void) -{ - BUG_ON(in_interrupt()); - down(&console_sem); - if (console_suspended) - return; - console_locked = 1; - console_may_schedule = 1; -} -EXPORT_SYMBOL(console_lock); - -/** - * console_trylock - try to lock the console system for exclusive use. - * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. - * - * returns 1 on success, and 0 on failure to acquire the lock. 
- */ -int console_trylock(void) -{ - if (down_trylock(&console_sem)) - return 0; - if (console_suspended) { - up(&console_sem); - return 0; - } - console_locked = 1; - console_may_schedule = 0; - return 1; -} -EXPORT_SYMBOL(console_trylock); - -int is_console_locked(void) -{ - return console_locked; -} - -static DEFINE_PER_CPU(int, printk_pending); - -void printk_tick(void) -{ - if (__this_cpu_read(printk_pending)) { - __this_cpu_write(printk_pending, 0); - wake_up_interruptible(&log_wait); - } -} - -int printk_needs_cpu(int cpu) -{ - if (cpu_is_offline(cpu)) - printk_tick(); - return __this_cpu_read(printk_pending); -} - -void wake_up_klogd(void) -{ - if (waitqueue_active(&log_wait)) - this_cpu_write(printk_pending, 1); -} - -/** - * console_unlock - unlock the console system - * - * Releases the console_lock which the caller holds on the console system - * and the console driver list. - * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. - * - * If there is output waiting for klogd, we wake it up. - * - * console_unlock(); may be called from any context. 
- */ -void console_unlock(void) -{ - unsigned long flags; - unsigned _con_start, _log_end; - unsigned wake_klogd = 0, retry = 0; - - if (console_suspended) { - up(&console_sem); - return; - } - - console_may_schedule = 0; - -again: - for ( ; ; ) { - raw_spin_lock_irqsave(&logbuf_lock, flags); - wake_klogd |= log_start - log_end; - if (con_start == log_end) - break; /* Nothing to print */ - _con_start = con_start; - _log_end = log_end; - con_start = log_end; /* Flush */ - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(_con_start, _log_end); - start_critical_timings(); - local_irq_restore(flags); - } - console_locked = 0; - - /* Release the exclusive_console once it is used */ - if (unlikely(exclusive_console)) - exclusive_console = NULL; - - raw_spin_unlock(&logbuf_lock); - - up(&console_sem); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - raw_spin_lock(&logbuf_lock); - if (con_start != log_end) - retry = 1; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - if (retry && console_trylock()) - goto again; - - if (wake_klogd) - wake_up_klogd(); -} -EXPORT_SYMBOL(console_unlock); - -/** - * console_conditional_schedule - yield the CPU if required - * - * If the console code is currently allowed to sleep, and - * if this CPU should yield the CPU to another task, do - * so here. - * - * Must be called within console_lock();. - */ -void __sched console_conditional_schedule(void) -{ - if (console_may_schedule) - cond_resched(); -} -EXPORT_SYMBOL(console_conditional_schedule); - -void console_unblank(void) -{ - struct console *c; - - /* - * console_unblank can no longer be called in interrupt context unless - * oops_in_progress is set to 1.. 
- */ - if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) - return; - } else - console_lock(); - - console_locked = 1; - console_may_schedule = 0; - for_each_console(c) - if ((c->flags & CON_ENABLED) && c->unblank) - c->unblank(); - console_unlock(); -} - -/* - * Return the console tty driver structure and its associated index - */ -struct tty_driver *console_device(int *index) -{ - struct console *c; - struct tty_driver *driver = NULL; - - console_lock(); - for_each_console(c) { - if (!c->device) - continue; - driver = c->device(c, index); - if (driver) - break; - } - console_unlock(); - return driver; -} - -/* - * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can - * re-enable output afterwards. - */ -void console_stop(struct console *console) -{ - console_lock(); - console->flags &= ~CON_ENABLED; - console_unlock(); -} -EXPORT_SYMBOL(console_stop); - -void console_start(struct console *console) -{ - console_lock(); - console->flags |= CON_ENABLED; - console_unlock(); -} -EXPORT_SYMBOL(console_start); - -static int __read_mostly keep_bootcon; - -static int __init keep_bootcon_setup(char *str) -{ - keep_bootcon = 1; - printk(KERN_INFO "debug: skip boot console de-registration.\n"); - - return 0; -} - -early_param("keep_bootcon", keep_bootcon_setup); - -/* - * The console driver calls this routine during kernel initialization - * to register the console printing procedure with printk() and to - * print any messages that were printed by the kernel before the - * console driver was initialized. - * - * This can happen pretty early during the boot process (because of - * early_printk) - sometimes before setup_arch() completes - be careful - * of what kernel features are used - they may not be initialised yet. 
- * - * There are two types of consoles - bootconsoles (early_printk) and - * "real" consoles (everything which is not a bootconsole) which are - * handled differently. - * - Any number of bootconsoles can be registered at any time. - * - As soon as a "real" console is registered, all bootconsoles - * will be unregistered automatically. - * - Once a "real" console is registered, any attempt to register a - * bootconsoles will be rejected - */ -void register_console(struct console *newcon) -{ - int i; - unsigned long flags; - struct console *bcon = NULL; - - /* - * before we register a new CON_BOOT console, make sure we don't - * already have a valid console - */ - if (console_drivers && newcon->flags & CON_BOOT) { - /* find the last or real console */ - for_each_console(bcon) { - if (!(bcon->flags & CON_BOOT)) { - printk(KERN_INFO "Too late to register bootconsole %s%d\n", - newcon->name, newcon->index); - return; - } - } - } - - if (console_drivers && console_drivers->flags & CON_BOOT) - bcon = console_drivers; - - if (preferred_console < 0 || bcon || !console_drivers) - preferred_console = selected_console; - - if (newcon->early_setup) - newcon->early_setup(); - - /* - * See if we want to use this console driver. If we - * didn't select a console we take the first one - * that registers here. - */ - if (preferred_console < 0) { - if (newcon->index < 0) - newcon->index = 0; - if (newcon->setup == NULL || - newcon->setup(newcon, NULL) == 0) { - newcon->flags |= CON_ENABLED; - if (newcon->device) { - newcon->flags |= CON_CONSDEV; - preferred_console = 0; - } - } - } - - /* - * See if this console matches one we selected on - * the command line. 
- */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; - i++) { - if (strcmp(console_cmdline[i].name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != console_cmdline[i].index) - continue; - if (newcon->index < 0) - newcon->index = console_cmdline[i].index; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console_cmdline[i].brl_options) { - newcon->flags |= CON_BRL; - braille_register_console(newcon, - console_cmdline[i].index, - console_cmdline[i].options, - console_cmdline[i].brl_options); - return; - } -#endif - if (newcon->setup && - newcon->setup(newcon, console_cmdline[i].options) != 0) - break; - newcon->flags |= CON_ENABLED; - newcon->index = console_cmdline[i].index; - if (i == selected_console) { - newcon->flags |= CON_CONSDEV; - preferred_console = selected_console; - } - break; - } - - if (!(newcon->flags & CON_ENABLED)) - return; - - /* - * If we have a bootconsole, and are switching to a real console, - * don't print everything out again, since when the boot console, and - * the real console are the same physical device, it's annoying to - * see the beginning boot messages twice - */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) - newcon->flags &= ~CON_PRINTBUFFER; - - /* - * Put this console in the list - keep the - * preferred driver at the head of the list. - */ - console_lock(); - if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { - newcon->next = console_drivers; - console_drivers = newcon; - if (newcon->next) - newcon->next->flags &= ~CON_CONSDEV; - } else { - newcon->next = console_drivers->next; - console_drivers->next = newcon; - } - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - con_start = log_start; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - /* - * We're about to replay the log buffer. 
Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - */ - exclusive_console = newcon; - } - console_unlock(); - console_sysfs_notify(); - - /* - * By unregistering the bootconsoles after we enable the real console - * we get the "console xxx enabled" message on all the consoles - - * boot consoles, real consoles, etc - this is to ensure that end - * users know there might be something in the kernel's log buffer that - * went to the bootconsole (that they do not see on the real console) - */ - if (bcon && - ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && - !keep_bootcon) { - /* we need to iterate through twice, to make sure we print - * everything out, before we unregister the console(s) - */ - printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", - newcon->name, newcon->index); - for_each_console(bcon) - if (bcon->flags & CON_BOOT) - unregister_console(bcon); - } else { - printk(KERN_INFO "%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); - } -} -EXPORT_SYMBOL(register_console); - -int unregister_console(struct console *console) -{ - struct console *a, *b; - int res = 1; - -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console->flags & CON_BRL) - return braille_unregister_console(console); -#endif - - console_lock(); - if (console_drivers == console) { - console_drivers=console->next; - res = 0; - } else if (console_drivers) { - for (a=console_drivers->next, b=console_drivers ; - a; b=a, a=b->next) { - if (a == console) { - b->next = a->next; - res = 0; - break; - } - } - } - - /* - * If this isn't the last console and it has CON_CONSDEV set, we - * need to set it on the next preferred console. 
- */ - if (console_drivers != NULL && console->flags & CON_CONSDEV) - console_drivers->flags |= CON_CONSDEV; - - console_unlock(); - console_sysfs_notify(); - return res; -} -EXPORT_SYMBOL(unregister_console); - -static int __init printk_late_init(void) -{ - struct console *con; - - for_each_console(con) { - if (!keep_bootcon && con->flags & CON_BOOT) { - printk(KERN_INFO "turn off boot console %s%d\n", - con->name, con->index); - unregister_console(con); - } - } - hotcpu_notifier(console_cpu_notify, 0); - return 0; -} -late_initcall(printk_late_init); - -#if defined CONFIG_PRINTK - -/* - * printk rate limiting, lifted from the networking subsystem. - * - * This enforces a rate limit: not more than 10 kernel messages - * every 5s to make a denial-of-service attack impossible. - */ -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); - -int __printk_ratelimit(const char *func) -{ - return ___ratelimit(&printk_ratelimit_state, func); -} -EXPORT_SYMBOL(__printk_ratelimit); - -/** - * printk_timed_ratelimit - caller-controlled printk ratelimiting - * @caller_jiffies: pointer to caller's state - * @interval_msecs: minimum interval between prints - * - * printk_timed_ratelimit() returns true if more than @interval_msecs - * milliseconds have elapsed since the last time printk_timed_ratelimit() - * returned true. - */ -bool printk_timed_ratelimit(unsigned long *caller_jiffies, - unsigned int interval_msecs) -{ - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; -} -EXPORT_SYMBOL(printk_timed_ratelimit); - -static DEFINE_SPINLOCK(dump_list_lock); -static LIST_HEAD(dump_list); - -/** - * kmsg_dump_register - register a kernel log dumper. - * @dumper: pointer to the kmsg_dumper structure - * - * Adds a kernel log dumper to the system. 
The dump callback in the - * structure will be called when the kernel oopses or panics and must be - * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. - */ -int kmsg_dump_register(struct kmsg_dumper *dumper) -{ - unsigned long flags; - int err = -EBUSY; - - /* The dump callback needs to be set */ - if (!dumper->dump) - return -EINVAL; - - spin_lock_irqsave(&dump_list_lock, flags); - /* Don't allow registering multiple times */ - if (!dumper->registered) { - dumper->registered = 1; - list_add_tail_rcu(&dumper->list, &dump_list); - err = 0; - } - spin_unlock_irqrestore(&dump_list_lock, flags); - - return err; -} -EXPORT_SYMBOL_GPL(kmsg_dump_register); - -/** - * kmsg_dump_unregister - unregister a kmsg dumper. - * @dumper: pointer to the kmsg_dumper structure - * - * Removes a dump device from the system. Returns zero on success and - * %-EINVAL otherwise. - */ -int kmsg_dump_unregister(struct kmsg_dumper *dumper) -{ - unsigned long flags; - int err = -EINVAL; - - spin_lock_irqsave(&dump_list_lock, flags); - if (dumper->registered) { - dumper->registered = 0; - list_del_rcu(&dumper->list); - err = 0; - } - spin_unlock_irqrestore(&dump_list_lock, flags); - synchronize_rcu(); - - return err; -} -EXPORT_SYMBOL_GPL(kmsg_dump_unregister); - -/** - * kmsg_dump - dump kernel log to kernel message dumpers. - * @reason: the reason (oops, panic etc) for dumping - * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. - */ -void kmsg_dump(enum kmsg_dump_reason reason) -{ - unsigned long end; - unsigned chars; - struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; - unsigned long flags; - - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. 
*/ - raw_spin_lock_irqsave(&logbuf_lock, flags); - end = log_end & LOG_BUF_MASK; - chars = logged_chars; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - if (chars > end) { - s1 = log_buf + log_buf_len - chars + end; - l1 = chars - end; - - s2 = log_buf; - l2 = end; - } else { - s1 = ""; - l1 = 0; - - s2 = log_buf + end - chars; - l2 = chars; - } - - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); -} -#endif -/* - * linux/kernel/profile.c - * Simple profiling. Manages a direct-mapped profile hit count buffer, - * with configurable resolution, support for restricting the cpus on - * which profiling is done, and switching between cpu time and - * schedule() calls via kernel command line parameters passed at boot. - * - * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, - * Red Hat, July 2004 - * Consolidation of architecture support code for profiling, - * William Irwin, Oracle, July 2004 - * Amortized hit count accounting via per-cpu open-addressed hashtables - * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct profile_hit { - u32 pc, hits; -}; -#define PROFILE_GRPSHIFT 3 -#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) -#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) -#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) - -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; - -static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; - -int prof_on __read_mostly; -EXPORT_SYMBOL_GPL(prof_on); - -static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* 
CONFIG_SMP */ - -int profile_setup(char *str) -{ - static char schedstr[] = "schedule"; - static char sleepstr[] = "sleep"; - static char kvmstr[] = "kvm"; - int par; - - if (!strncmp(str, sleepstr, strlen(sleepstr))) { -#ifdef CONFIG_SCHEDSTATS - prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = par; - printk(KERN_INFO - "kernel sleep profiling enabled (shift: %ld)\n", - prof_shift); -#else - printk(KERN_WARNING - "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); -#endif /* CONFIG_SCHEDSTATS */ - } else if (!strncmp(str, schedstr, strlen(schedstr))) { - prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - prof_shift = par; - printk(KERN_INFO - "kernel schedule profiling enabled (shift: %ld)\n", - prof_shift); - } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { - prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = par; - printk(KERN_INFO - "kernel KVM profiling enabled (shift: %ld)\n", - prof_shift); - } else if (get_option(&str, &par)) { - prof_shift = par; - prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", - prof_shift); - } - return 1; -} -__setup("profile=", profile_setup); - - -int __ref profile_init(void) -{ - int buffer_bytes; - if (!prof_on) - return 0; - - /* only text is profiled */ - prof_len = (_etext - _stext) >> prof_shift; - buffer_bytes = prof_len*sizeof(atomic_t); - - if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); - if (prof_buffer) - return 0; - - prof_buffer = alloc_pages_exact(buffer_bytes, - GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); - if (prof_buffer) - return 0; - - prof_buffer = vzalloc(buffer_bytes); - if (prof_buffer) - 
return 0; - - free_cpumask_var(prof_cpu_mask); - return -ENOMEM; -} - -/* Profile event notifications */ - -static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); -static ATOMIC_NOTIFIER_HEAD(task_free_notifier); -static BLOCKING_NOTIFIER_HEAD(munmap_notifier); - -void profile_task_exit(struct task_struct *task) -{ - blocking_notifier_call_chain(&task_exit_notifier, 0, task); -} - -int profile_handoff_task(struct task_struct *task) -{ - int ret; - ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); - return (ret == NOTIFY_OK) ? 1 : 0; -} - -void profile_munmap(unsigned long addr) -{ - blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); -} - -int task_handoff_register(struct notifier_block *n) -{ - return atomic_notifier_chain_register(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_register); - -int task_handoff_unregister(struct notifier_block *n) -{ - return atomic_notifier_chain_unregister(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_unregister); - -int profile_event_register(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_register( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_register( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_register); - -int profile_event_unregister(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_unregister( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_unregister( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_unregister); - -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - if (timer_hook) - return -EBUSY; - timer_hook = hook; - return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void 
unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - WARN_ON(hook != timer_hook); - timer_hook = NULL; - /* make sure all CPUs see the NULL hook */ - synchronize_sched(); /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - - -#ifdef CONFIG_SMP -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). - * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. 
- * - * -- wli - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. 
The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int __cpuinit profile_cpu_callback(struct notifier_block *info, - unsigned long action, void *__cpu) -{ - int node, cpu = (unsigned long)__cpu; - struct page *page; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - node = cpu_to_mem(cpu); - per_cpu(cpu_profile_flip, cpu) = 0; - if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - return notifier_from_errno(-ENOMEM); - per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); - } - if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - goto out_free; - per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); - } - break; -out_free: - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - return notifier_from_errno(-ENOMEM); - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_set_cpu(cpu, prof_cpu_mask); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_clear_cpu(cpu, prof_cpu_mask); - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, 
cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - break; - } - return NOTIFY_OK; -} -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) -#define profile_cpu_callback NULL - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long pc; - pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); -} -#endif /* !CONFIG_SMP */ - -void profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - if (prof_on != type || !prof_buffer) - return; - do_profile_hits(type, __pc, nr_hits); -} -EXPORT_SYMBOL_GPL(profile_hits); - -void profile_tick(int type) -{ - struct pt_regs *regs = get_irq_regs(); - - if (type == CPU_PROFILING && timer_hook) - timer_hook(regs); - if (!user_mode(regs) && prof_cpu_mask != NULL && - cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) - profile_hit(type, (void *)profile_pc(regs)); -} - -#ifdef CONFIG_PROC_FS -#include -#include -#include - -static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) -{ - seq_cpumask(m, prof_cpu_mask); - seq_putc(m, '\n'); - return 0; -} - -static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, prof_cpu_mask_proc_show, NULL); -} - -static ssize_t prof_cpu_mask_proc_write(struct file *file, - const char __user *buffer, size_t count, loff_t *pos) -{ - cpumask_var_t new_value; - int err; - - if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) - return -ENOMEM; - - err = cpumask_parse_user(buffer, count, new_value); - if (!err) { - cpumask_copy(prof_cpu_mask, new_value); - err = count; - } - free_cpumask_var(new_value); - return err; -} - -static const struct file_operations 
prof_cpu_mask_proc_fops = { - .open = prof_cpu_mask_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = prof_cpu_mask_proc_write, -}; - -void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) -{ - /* create /proc/irq/prof_cpu_mask */ - proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); -} - -/* - * This function accesses profiling information. The returned data is - * binary: the sampling step and the actual contents of the profile - * buffer. Use of the program readprofile is recommended in order to - * get meaningful info out of these data. - */ -static ssize_t -read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t read; - char *pnt; - unsigned int sample_step = 1 << prof_shift; - - profile_flip_buffers(); - if (p >= (prof_len+1)*sizeof(unsigned int)) - return 0; - if (count > (prof_len+1)*sizeof(unsigned int) - p) - count = (prof_len+1)*sizeof(unsigned int) - p; - read = 0; - - while (p < sizeof(unsigned int) && count > 0) { - if (put_user(*((char *)(&sample_step)+p), buf)) - return -EFAULT; - buf++; p++; count--; read++; - } - pnt = (char *)prof_buffer + p - sizeof(atomic_t); - if (copy_to_user(buf, (void *)pnt, count)) - return -EFAULT; - read += count; - *ppos += read; - return read; -} - -/* - * Writing to /proc/profile resets the counters - * - * Writing a 'profiling multiplier' value into it also re-sets the profiling - * interrupt frequency, on architectures that support this. 
- */ -static ssize_t write_profile(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ -#ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - - if (count == sizeof(int)) { - unsigned int multiplier; - - if (copy_from_user(&multiplier, buf, sizeof(int))) - return -EFAULT; - - if (setup_profiling_timer(multiplier)) - return -EINVAL; - } -#endif - profile_discard_flip_buffers(); - memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); - return count; -} - -static const struct file_operations proc_profile_operations = { - .read = read_profile, - .write = write_profile, - .llseek = default_llseek, -}; - -#ifdef CONFIG_SMP -static void profile_nop(void *unused) -{ -} - -static int create_hash_tables(void) -{ - int cpu; - - for_each_online_cpu(cpu) { - int node = cpu_to_mem(cpu); - struct page *page; - - page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[1] - = (struct profile_hit *)page_address(page); - page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[0] - = (struct profile_hit *)page_address(page); - } - return 0; -out_cleanup: - prof_on = 0; - smp_mb(); - on_each_cpu(profile_nop, NULL, 1); - for_each_online_cpu(cpu) { - struct page *page; - - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - } - return -1; -} -#else -#define create_hash_tables() ({ 0; }) -#endif - -int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ -{ - struct proc_dir_entry *entry; - - if (!prof_on) - return 0; - if (create_hash_tables()) - 
return -ENOMEM; - entry = proc_create("profile", S_IWUSR | S_IRUGO, - NULL, &proc_profile_operations); - if (!entry) - return 0; - entry->size = (1+prof_len) * sizeof(atomic_t); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; -} -module_init(create_proc_profile); -#endif /* CONFIG_PROC_FS */ -/* - * linux/kernel/ptrace.c - * - * (C) Copyright 1999 Linus Torvalds - * - * Common interfaces for "ptrace()" which we do not want - * to continually duplicate across every architecture. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static int ptrace_trapping_sleep_fn(void *flags) -{ - schedule(); - return 0; -} - -/* - * ptrace a task: make the debugger its new parent and - * move it to the ptrace list. - * - * Must be called with the tasklist lock write-held. - */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) -{ - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; -} - -/** - * __ptrace_unlink - unlink ptracee and restore its execution state - * @child: ptracee to be unlinked - * - * Remove @child from the ptrace list, move it back to the original parent, - * and restore the execution state so that it conforms to the group stop - * state. - * - * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer - * exiting. For PTRACE_DETACH, unless the ptracee has been killed between - * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. - * If the ptracer is exiting, the ptracee can be in any state. - * - * After detach, the ptracee should be in a state which conforms to the - * group stop. If the group is stopped or in the process of stopping, the - * ptracee should be put into TASK_STOPPED; otherwise, it should be woken - * up from TASK_TRACED. 
- * - * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, - * it goes through TRACED -> RUNNING -> STOPPED transition which is similar - * to but in the opposite direction of what happens while attaching to a - * stopped task. However, in this direction, the intermediate RUNNING - * state is not hidden even from the current ptracer and if it immediately - * re-attaches and performs a WNOHANG wait(2), it may fail. - * - * CONTEXT: - * write_lock_irq(tasklist_lock) - */ -void __ptrace_unlink(struct task_struct *child) -{ - BUG_ON(!child->ptrace); - - child->ptrace = 0; - child->parent = child->real_parent; - list_del_init(&child->ptrace_entry); - - spin_lock(&child->sighand->siglock); - - /* - * Clear all pending traps and TRAPPING. TRAPPING should be - * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. - */ - task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); - task_clear_jobctl_trapping(child); - - /* - * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and - * @child isn't dead. - */ - if (!(child->flags & PF_EXITING) && - (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) { - child->jobctl |= JOBCTL_STOP_PENDING; - - /* - * This is only possible if this thread was cloned by the - * traced task running in the stopped group, set the signal - * for the future reports. - * FIXME: we should change ptrace_init_task() to handle this - * case. - */ - if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) - child->jobctl |= SIGSTOP; - } - - /* - * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick - * @child in the butt. Note that @resume should be used iff @child - * is in TASK_TRACED; otherwise, we might unduly disrupt - * TASK_KILLABLE sleeps. 
- */ - if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); - - spin_unlock(&child->sighand->siglock); -} - -/** - * ptrace_check_attach - check whether ptracee is ready for ptrace operation - * @child: ptracee to check for - * @ignore_state: don't check whether @child is currently %TASK_TRACED - * - * Check whether @child is being ptraced by %current and ready for further - * ptrace operations. If @ignore_state is %false, @child also should be in - * %TASK_TRACED state and on return the child is guaranteed to be traced - * and not executing. If @ignore_state is %true, @child can be in any - * state. - * - * CONTEXT: - * Grabs and releases tasklist_lock and @child->sighand->siglock. - * - * RETURNS: - * 0 on success, -ESRCH if %child is not ready. - */ -int ptrace_check_attach(struct task_struct *child, bool ignore_state) -{ - int ret = -ESRCH; - - /* - * We take the read lock around doing both checks to close a - * possible race where someone else was tracing our child and - * detached between these two checks. After this locked check, - * we are sure that this is our traced child and that can only - * be changed by us so it's not changing right after this. - */ - read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { - /* - * child->sighand can't be NULL, release_task() - * does ptrace_unlink() before __exit_signal(). - */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (ignore_state || (task_is_traced(child) && - !(child->jobctl & JOBCTL_LISTENING))) - ret = 0; - spin_unlock_irq(&child->sighand->siglock); - } - read_unlock(&tasklist_lock); - - if (!ret && !ignore_state) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; - - /* All systems go.. 
*/ - return ret; -} - -static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) -{ - if (mode & PTRACE_MODE_NOAUDIT) - return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); - else - return has_ns_capability(current, ns, CAP_SYS_PTRACE); -} - -int __ptrace_may_access(struct task_struct *task, unsigned int mode) -{ - const struct cred *cred = current_cred(), *tcred; - - /* May we inspect the given task? - * This check is used both for attaching with ptrace - * and for allowing access to sensitive information in /proc. - * - * ptrace_attach denies several cases that /proc allows - * because setting up the necessary parent/child relationship - * or halting the specified task is impossible. - */ - int dumpable = 0; - /* Don't let security modules deny introspection */ - if (task == current) - return 0; - rcu_read_lock(); - tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) - goto ok; - if (ptrace_has_cap(tcred->user->user_ns, mode)) - goto ok; - rcu_read_unlock(); - return -EPERM; -ok: - rcu_read_unlock(); - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) - return -EPERM; - - return security_ptrace_access_check(task, mode); -} - -bool ptrace_may_access(struct task_struct *task, unsigned int mode) -{ - int err; - task_lock(task); - err = __ptrace_may_access(task, mode); - task_unlock(task); - return !err; -} - -static int ptrace_attach(struct task_struct *task, long request, - unsigned long flags) -{ - bool seize = (request == PTRACE_SEIZE); - int retval; - - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. 
SEIZE_DEVEL is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag and - * the following test will be removed. - */ - retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; - - audit_ptrace(task); - - retval = -EPERM; - if (unlikely(task->flags & PF_KTHREAD)) - goto out; - if (same_thread_group(task, current)) - goto out; - - /* - * Protect exec's credential calculations against our interference; - * interference; SUID, SGID and LSM creds get determined differently - * under ptrace. - */ - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) - goto out; - - task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); - task_unlock(task); - if (retval) - goto unlock_creds; - - write_lock_irq(&tasklist_lock); - retval = -EPERM; - if (unlikely(task->exit_state)) - goto unlock_tasklist; - if (task->ptrace) - goto unlock_tasklist; - - task->ptrace = PT_PTRACED; - if (seize) - task->ptrace |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; - - __ptrace_link(task, current); - - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); - - spin_lock(&task->sighand->siglock); - - /* - * If the task is already STOPPED, set JOBCTL_TRAP_STOP and - * TRAPPING, and kick it so that it transits to TRACED. TRAPPING - * will be cleared if the child completes the transition or any - * event which clears the group stop states happens. We'll wait - * for the transition to complete before returning from this - * function. 
- * - * This hides STOPPED -> RUNNING -> TRACED transition from the - * attaching thread but a different thread in the same group can - * still observe the transient RUNNING state. IOW, if another - * thread's WNOHANG wait(2) on the stopped tracee races against - * ATTACH, the wait(2) may fail due to the transient RUNNING. - * - * The following task_is_stopped() test is safe as both transitions - * in and out of STOPPED are protected by siglock. - */ - if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) - signal_wake_up(task, 1); - - spin_unlock(&task->sighand->siglock); - - retval = 0; -unlock_tasklist: - write_unlock_irq(&tasklist_lock); -unlock_creds: - mutex_unlock(&task->signal->cred_guard_mutex); -out: - if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); - proc_ptrace_connector(task, PTRACE_ATTACH); - } - - return retval; -} - -/** - * ptrace_traceme -- helper for PTRACE_TRACEME - * - * Performs checks and sets PT_PTRACED. - * Should be used by all ptrace implementations for PTRACE_TRACEME. - */ -static int ptrace_traceme(void) -{ - int ret = -EPERM; - - write_lock_irq(&tasklist_lock); - /* Are we already being traced? */ - if (!current->ptrace) { - ret = security_ptrace_traceme(current->parent); - /* - * Check PF_EXITING to ensure ->real_parent has not passed - * exit_ptrace(). Otherwise we don't report the error but - * pretend ->real_parent untraces us right after return. - */ - if (!ret && !(current->real_parent->flags & PF_EXITING)) { - current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); - } - } - write_unlock_irq(&tasklist_lock); - - return ret; -} - -/* - * Called with irqs disabled, returns true if childs should reap themselves. 
- */ -static int ignoring_children(struct sighand_struct *sigh) -{ - int ret; - spin_lock(&sigh->siglock); - ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || - (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); - spin_unlock(&sigh->siglock); - return ret; -} - -/* - * Called with tasklist_lock held for writing. - * Unlink a traced task, and clean it up if it was a traced zombie. - * Return true if it needs to be reaped with release_task(). - * (We can't call release_task() here because we already hold tasklist_lock.) - * - * If it's a zombie, our attachedness prevented normal parent notification - * or self-reaping. Do notification now if it would have happened earlier. - * If it should reap itself, return true. - * - * If it's our own child, there is no notification to do. But if our normal - * children self-reap, then this child was prevented by ptrace and we must - * reap it now, in that case we must also wake up sub-threads sleeping in - * do_wait(). - */ -static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) -{ - bool dead; - - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - return false; - - dead = !thread_group_leader(p); - - if (!dead && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - dead = do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { - __wake_up_parent(p, tracer); - dead = true; - } - } - /* Mark it as in the process of being reaped. */ - if (dead) - p->exit_state = EXIT_DEAD; - return dead; -} - -static int ptrace_detach(struct task_struct *child, unsigned int data) -{ - bool dead = false; - - if (!valid_signal(data)) - return -EIO; - - /* Architecture-specific hardware disable .. */ - ptrace_disable(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - - write_lock_irq(&tasklist_lock); - /* - * This child can be already killed. Make sure de_thread() or - * our sub-thread doing do_wait() didn't do release_task() yet. 
- */ - if (child->ptrace) { - child->exit_code = data; - dead = __ptrace_detach(current, child); - } - write_unlock_irq(&tasklist_lock); - - proc_ptrace_connector(child, PTRACE_DETACH); - if (unlikely(dead)) - release_task(child); - - return 0; -} - -/* - * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. - */ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) -{ - struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; - - list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { - if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } - - write_lock_irq(&tasklist_lock); -} - -int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) -{ - int copied = 0; - - while (len > 0) { - char buf[128]; - int this_len, retval; - - this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); - if (!retval) { - if (copied) - break; - return -EIO; - } - if (copy_to_user(dst, buf, retval)) - return -EFAULT; - copied += retval; - src += retval; - dst += retval; - len -= retval; - } - return copied; -} - -int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len) -{ - int copied = 0; - - while (len > 0) { - char buf[128]; - int this_len, retval; - - this_len = (len > sizeof(buf)) ? 
sizeof(buf) : len; - if (copy_from_user(buf, src, this_len)) - return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); - if (!retval) { - if (copied) - break; - return -EIO; - } - copied += retval; - src += retval; - dst += retval; - len -= retval; - } - return copied; -} - -static int ptrace_setoptions(struct task_struct *child, unsigned long data) -{ - child->ptrace &= ~PT_TRACE_MASK; - - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; - - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; - - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; -} - -static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) -{ - unsigned long flags; - int error = -ESRCH; - - if (lock_task_sighand(child, &flags)) { - error = -EINVAL; - if (likely(child->last_siginfo != NULL)) { - *info = *child->last_siginfo; - error = 0; - } - unlock_task_sighand(child, &flags); - } - return error; -} - -static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) -{ - unsigned long flags; - int error = -ESRCH; - - if (lock_task_sighand(child, &flags)) { - error = -EINVAL; - if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = *info; - error = 0; - } - unlock_task_sighand(child, &flags); - } - return error; -} - - -#ifdef PTRACE_SINGLESTEP -#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif - -#ifdef PTRACE_SINGLEBLOCK -#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) -#else -#define is_singleblock(request) 0 -#endif - -#ifdef PTRACE_SYSEMU -#define 
is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) -#else -#define is_sysemu_singlestep(request) 0 -#endif - -static int ptrace_resume(struct task_struct *child, long request, - unsigned long data) -{ - if (!valid_signal(data)) - return -EIO; - - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - -#ifdef TIF_SYSCALL_EMU - if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -#endif - - if (is_singleblock(request)) { - if (unlikely(!arch_has_block_step())) - return -EIO; - user_enable_block_step(child); - } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { - if (unlikely(!arch_has_single_step())) - return -EIO; - user_enable_single_step(child); - } else { - user_disable_single_step(child); - } - - child->exit_code = data; - wake_up_state(child, __TASK_TRACED); - - return 0; -} - -#ifdef CONFIG_HAVE_ARCH_TRACEHOOK - -static const struct user_regset * -find_regset(const struct user_regset_view *view, unsigned int type) -{ - const struct user_regset *regset; - int n; - - for (n = 0; n < view->n; ++n) { - regset = view->regsets + n; - if (regset->core_note_type == type) - return regset; - } - - return NULL; -} - -static int ptrace_regset(struct task_struct *task, int req, unsigned int type, - struct iovec *kiov) -{ - const struct user_regset_view *view = task_user_regset_view(task); - const struct user_regset *regset = find_regset(view, type); - int regset_no; - - if (!regset || (kiov->iov_len % regset->size) != 0) - return -EINVAL; - - regset_no = regset - view->regsets; - kiov->iov_len = min(kiov->iov_len, - (__kernel_size_t) (regset->n * regset->size)); - - if (req == PTRACE_GETREGSET) - return copy_regset_to_user(task, view, regset_no, 0, - kiov->iov_len, kiov->iov_base); - else - return copy_regset_from_user(task, view, 
regset_no, 0, - kiov->iov_len, kiov->iov_base); -} - -#endif - -int ptrace_request(struct task_struct *child, long request, - unsigned long addr, unsigned long data) -{ - bool seized = child->ptrace & PT_SEIZED; - int ret = -EIO; - siginfo_t siginfo, *si; - void __user *datavp = (void __user *) data; - unsigned long __user *datalp = datavp; - unsigned long flags; - - switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - return generic_ptrace_peekdata(child, addr, data); - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - return generic_ptrace_pokedata(child, addr, data); - -#ifdef PTRACE_OLDSETOPTIONS - case PTRACE_OLDSETOPTIONS: -#endif - case PTRACE_SETOPTIONS: - ret = ptrace_setoptions(child, data); - break; - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, datalp); - break; - - case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, &siginfo); - if (!ret) - ret = copy_siginfo_to_user(datavp, &siginfo); - break; - - case PTRACE_SETSIGINFO: - if (copy_from_user(&siginfo, datavp, sizeof siginfo)) - ret = -EFAULT; - else - ret = ptrace_setsiginfo(child, &siginfo); - break; - - case PTRACE_INTERRUPT: - /* - * Stop tracee without any side-effect on signal or job - * control. At least one trap is guaranteed to happen - * after this request. If @child is already trapped, the - * current trap is not disturbed and another trap will - * happen after the current trap is ended with PTRACE_CONT. - * - * The actual trap might not be PTRACE_EVENT_STOP trap but - * the pending condition is cleared regardless. - */ - if (unlikely(!seized || !lock_task_sighand(child, &flags))) - break; - - /* - * INTERRUPT doesn't disturb existing trap sans one - * exception. If ptracer issued LISTEN for the current - * STOP, this INTERRUPT should clear LISTEN and re-trap - * tracee into STOP. 
- */ - if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); - - unlock_task_sighand(child, &flags); - ret = 0; - break; - - case PTRACE_LISTEN: - /* - * Listen for events. Tracee must be in STOP. It's not - * resumed per-se but is not considered to be in TRACED by - * wait(2) or ptrace(2). If an async event (e.g. group - * stop state change) happens, tracee will enter STOP trap - * again. Alternatively, ptracer can issue INTERRUPT to - * finish listening and re-trap tracee into STOP. - */ - if (unlikely(!seized || !lock_task_sighand(child, &flags))) - break; - - si = child->last_siginfo; - if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) { - child->jobctl |= JOBCTL_LISTENING; - /* - * If NOTIFY is set, it means event happened between - * start of this trap and now. Trigger re-trap. - */ - if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); - ret = 0; - } - unlock_task_sighand(child, &flags); - break; - - case PTRACE_DETACH: /* detach a process that was attached. 
*/ - ret = ptrace_detach(child, data); - break; - -#ifdef CONFIG_BINFMT_ELF_FDPIC - case PTRACE_GETFDPIC: { - struct mm_struct *mm = get_task_mm(child); - unsigned long tmp = 0; - - ret = -ESRCH; - if (!mm) - break; - - switch (addr) { - case PTRACE_GETFDPIC_EXEC: - tmp = mm->context.exec_fdpic_loadmap; - break; - case PTRACE_GETFDPIC_INTERP: - tmp = mm->context.interp_fdpic_loadmap; - break; - default: - break; - } - mmput(mm); - - ret = put_user(tmp, datalp); - break; - } -#endif - -#ifdef PTRACE_SINGLESTEP - case PTRACE_SINGLESTEP: -#endif -#ifdef PTRACE_SINGLEBLOCK - case PTRACE_SINGLEBLOCK: -#endif -#ifdef PTRACE_SYSEMU - case PTRACE_SYSEMU: - case PTRACE_SYSEMU_SINGLESTEP: -#endif - case PTRACE_SYSCALL: - case PTRACE_CONT: - return ptrace_resume(child, request, data); - - case PTRACE_KILL: - if (child->exit_state) /* already dead */ - return 0; - return ptrace_resume(child, request, SIGKILL); - -#ifdef CONFIG_HAVE_ARCH_TRACEHOOK - case PTRACE_GETREGSET: - case PTRACE_SETREGSET: - { - struct iovec kiov; - struct iovec __user *uiov = datavp; - - if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) - return -EFAULT; - - if (__get_user(kiov.iov_base, &uiov->iov_base) || - __get_user(kiov.iov_len, &uiov->iov_len)) - return -EFAULT; - - ret = ptrace_regset(child, request, addr, &kiov); - if (!ret) - ret = __put_user(kiov.iov_len, &uiov->iov_len); - break; - } -#endif - default: - break; - } - - return ret; -} - -static struct task_struct *ptrace_get_task_struct(pid_t pid) -{ - struct task_struct *child; - - rcu_read_lock(); - child = find_task_by_vpid(pid); - if (child) - get_task_struct(child); - rcu_read_unlock(); - - if (!child) - return ERR_PTR(-ESRCH); - return child; -} - -#ifndef arch_ptrace_attach -#define arch_ptrace_attach(child) do { } while (0) -#endif - -SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, - unsigned long, data) -{ - struct task_struct *child; - long ret; - - if (request == PTRACE_TRACEME) { - ret = ptrace_traceme(); - 
if (!ret) - arch_ptrace_attach(current); - goto out; - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); - goto out_put_task_struct; - } - - ret = ptrace_check_attach(child, request == PTRACE_KILL || - request == PTRACE_INTERRUPT); - if (ret < 0) - goto out_put_task_struct; - - ret = arch_ptrace(child, request, addr, data); - - out_put_task_struct: - put_task_struct(child); - out: - return ret; -} - -int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, - unsigned long data) -{ - unsigned long tmp; - int copied; - - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); - if (copied != sizeof(tmp)) - return -EIO; - return put_user(tmp, (unsigned long __user *)data); -} - -int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, - unsigned long data) -{ - int copied; - - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); - return (copied == sizeof(data)) ? 0 : -EIO; -} - -#if defined CONFIG_COMPAT -#include - -int compat_ptrace_request(struct task_struct *child, compat_long_t request, - compat_ulong_t addr, compat_ulong_t data) -{ - compat_ulong_t __user *datap = compat_ptr(data); - compat_ulong_t word; - siginfo_t siginfo; - int ret; - - switch (request) { - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); - if (ret != sizeof(word)) - ret = -EIO; - else - ret = put_user(word, datap); - break; - - case PTRACE_POKETEXT: - case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); - ret = (ret != sizeof(data) ? 
-EIO : 0); - break; - - case PTRACE_GETEVENTMSG: - ret = put_user((compat_ulong_t) child->ptrace_message, datap); - break; - - case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, &siginfo); - if (!ret) - ret = copy_siginfo_to_user32( - (struct compat_siginfo __user *) datap, - &siginfo); - break; - - case PTRACE_SETSIGINFO: - memset(&siginfo, 0, sizeof siginfo); - if (copy_siginfo_from_user32( - &siginfo, (struct compat_siginfo __user *) datap)) - ret = -EFAULT; - else - ret = ptrace_setsiginfo(child, &siginfo); - break; -#ifdef CONFIG_HAVE_ARCH_TRACEHOOK - case PTRACE_GETREGSET: - case PTRACE_SETREGSET: - { - struct iovec kiov; - struct compat_iovec __user *uiov = - (struct compat_iovec __user *) datap; - compat_uptr_t ptr; - compat_size_t len; - - if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) - return -EFAULT; - - if (__get_user(ptr, &uiov->iov_base) || - __get_user(len, &uiov->iov_len)) - return -EFAULT; - - kiov.iov_base = compat_ptr(ptr); - kiov.iov_len = len; - - ret = ptrace_regset(child, request, addr, &kiov); - if (!ret) - ret = __put_user(kiov.iov_len, &uiov->iov_len); - break; - } -#endif - - default: - ret = ptrace_request(child, request, addr, data); - } - - return ret; -} - -asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data) -{ - struct task_struct *child; - long ret; - - if (request == PTRACE_TRACEME) { - ret = ptrace_traceme(); - goto out; - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. 
- */ - if (!ret) - arch_ptrace_attach(child); - goto out_put_task_struct; - } - - ret = ptrace_check_attach(child, request == PTRACE_KILL || - request == PTRACE_INTERRUPT); - if (!ret) - ret = compat_arch_ptrace(child, request, addr, data); - - out_put_task_struct: - put_task_struct(child); - out: - return ret; -} -#endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -/* - * Range add and subtract - */ -#include -#include -#include - -#include - -int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) -{ - if (start >= end) - return nr_range; - - /* Out of slots: */ - if (nr_range >= az) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -int add_range_with_merge(struct range *range, int az, int nr_range, - u64 start, u64 end) -{ - int i; - - if (start >= end) - return nr_range; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - u64 final_start, final_end; - u64 common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, az, nr_range, start, end); -} - -void subtract_range(struct range *range, int az, u64 start, u64 end) -{ - int i, j; - - if (start >= end) - return; - - for (j = 0; j < az; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= 
range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end) { - range[j].start = end; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start) { - range[j].end = start; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < az; i++) { - if (range[i].end == 0) - break; - } - if (i < az) { - range[i].end = range[j].end; - range[i].start = end; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start; - continue; - } - } -} - -static int cmp_range(const void *x1, const void *x2) -{ - const struct range *r1 = x1; - const struct range *r2 = x2; - s64 start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -int clean_sort_range(struct range *range, int az) -{ - int i, j, k = az - 1, nr_range = az; - - for (i = 0; i < k; i++) { - if (range[i].end) - continue; - for (j = k; j > i; j--) { - if (range[j].end) { - k = j; - break; - } - } - if (j == i) - break; - range[i].start = range[k].start; - range[i].end = range[k].end; - range[k].start = 0; - range[k].end = 0; - k--; - } - /* count it */ - for (i = 0; i < az; i++) { - if (!range[i].end) { - nr_range = i; - break; - } - } - - /* sort them */ - sort(range, nr_range, sizeof(struct range), cmp_range, NULL); - - return nr_range; -} - -void sort_range(struct range *range, int nr_range) -{ - /* sort them */ - sort(range, nr_range, sizeof(struct range), cmp_range, NULL); -} -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#include "rcu.h" - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); - -static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); -EXPORT_SYMBOL_GPL(rcu_bh_lock_map); - -static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); -EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int debug_lockdep_rcu_enabled(void) -{ - return rcu_scheduler_active && debug_locks && - current->lockdep_recursion == 0; -} -EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); - 
-/** - * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? - * - * Check for bottom half being disabled, which covers both the - * CONFIG_PROVE_RCU and not cases. Note that if someone uses - * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. This is useful for debug checks in functions - * that require that they be called within an RCU read-side critical - * section. - * - * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. - */ -int rcu_read_lock_bh_held(void) -{ - if (!debug_lockdep_rcu_enabled()) - return 1; - if (rcu_is_cpu_idle()) - return 0; - return in_softirq() || irqs_disabled(); -} -EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); - -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_rcu() instance now that a - * grace period has elapsed. - */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -void wait_rcu_gp(call_rcu_func_t crf) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(wait_rcu_gp); - -#ifdef CONFIG_PROVE_RCU -/* - * wrapper function to avoid #include problems. 
- */ -int rcu_my_thread_group_empty(void) -{ - return thread_group_empty(current); -} -EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); -#endif /* #ifdef CONFIG_PROVE_RCU */ - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static inline void debug_init_rcu_head(struct rcu_head *head) -{ - debug_object_init(head, &rcuhead_debug_descr); -} - -static inline void debug_rcu_head_free(struct rcu_head *head) -{ - debug_object_free(head, &rcuhead_debug_descr); -} - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_init(head, &rcuhead_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. 
- */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return 0; - - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_activate(head, &rcuhead_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. 
- */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_free(head, &rcuhead_debug_descr); - return 1; - default: - return 0; - } -} - -/** - * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects - * @head: pointer to rcu_head structure to be initialized - * - * This function informs debugobjects of a new rcu_head structure that - * has been allocated as an auto variable on the stack. This function - * is not required for rcu_head structures that are statically defined or - * that are dynamically allocated on the heap. This function has no - * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. - */ -void init_rcu_head_on_stack(struct rcu_head *head) -{ - debug_object_init_on_stack(head, &rcuhead_debug_descr); -} -EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); - -/** - * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects - * @head: pointer to rcu_head structure to be initialized - * - * This function informs debugobjects that an on-stack rcu_head structure - * is about to go out of scope. As with init_rcu_head_on_stack(), this - * function is not required for rcu_head structures that are statically - * defined or that are dynamically allocated on the heap. Also as with - * init_rcu_head_on_stack(), this function has no effect for - * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. 
- */ -void destroy_rcu_head_on_stack(struct rcu_head *head) -{ - debug_object_free(head, &rcuhead_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); - -struct debug_obj_descr rcuhead_debug_descr = { - .name = "rcu_head", - .fixup_init = rcuhead_fixup_init, - .fixup_activate = rcuhead_fixup_activate, - .fixup_free = rcuhead_fixup_free, -}; -EXPORT_SYMBOL_GPL(rcuhead_debug_descr); -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) -{ - trace_rcu_torture_read(rcutorturename, rhp); -} -EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); -#else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) -#endif -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Author: Paul E. 
McKenney - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_RCU_TRACE -#include -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -#include "rcu.h" - -/* Forward declarations for rcutiny_plugin.h. */ -struct rcu_ctrlblk; -static void invoke_rcu_callbacks(void); -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_ctrlblk *rcp); - -#include "rcutiny_plugin.h" - -static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; - -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(long long oldval) -{ - if (rcu_dynticks_nesting) { - RCU_TRACE(trace_rcu_dyntick("--=", - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ -} - -/* - * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). - */ -void rcu_idle_enter(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting = 0; - rcu_idle_enter_common(oldval); - local_irq_restore(flags); -} - -/* - * Exit an interrupt handler towards idle. 
- */ -void rcu_irq_exit(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting--; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(oldval); - local_irq_restore(flags); -} - -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/* - * Exit idle, so that we are no longer in an extended quiescent state. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rcu_dynticks_nesting = DYNTICK_TASK_NESTING; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); -} - -/* - * Enter an interrupt handler, moving away from idle. - */ -void rcu_irq_enter(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); -} - -#ifdef CONFIG_PROVE_RCU - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -int rcu_is_cpu_idle(void) -{ - return !rcu_dynticks_nesting; -} -EXPORT_SYMBOL(rcu_is_cpu_idle); - -#endif /* #ifdef CONFIG_PROVE_RCU */ - -/* - * Test whether the current CPU was interrupted from idle. 
Nested - * interrupts don't count, we must be running at the first interrupt - * level. - */ -int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 0; -} - -/* - * Helper function for rcu_sched_qs() and rcu_bh_qs(). - * Also irqs are disabled to avoid confusion due to interrupt handlers - * invoking call_rcu(). - */ -static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) -{ - if (rcp->rcucblist != NULL && - rcp->donetail != rcp->curtail) { - rcp->donetail = rcp->curtail; - return 1; - } - - return 0; -} - -/* - * Record an rcu quiescent state. And an rcu_bh quiescent state while we - * are at it, given that any rcu quiescent state is also an rcu_bh - * quiescent state. Use "+" instead of "||" to defeat short circuiting. - */ -void rcu_sched_qs(int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - if (rcu_qsctr_help(&rcu_sched_ctrlblk) + - rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); - local_irq_restore(flags); -} - -/* - * Record an rcu_bh quiescent state. - */ -void rcu_bh_qs(int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); - local_irq_restore(flags); -} - -/* - * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. This function must - * be called from hardirq context. It is normally called from the - * scheduling-clock interrupt. - */ -void rcu_check_callbacks(int cpu, int user) -{ - if (user || rcu_is_cpu_rrupt_from_idle()) - rcu_sched_qs(cpu); - else if (!in_softirq()) - rcu_bh_qs(cpu); - rcu_preempt_check_callbacks(); -} - -/* - * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure - * whose grace period has elapsed. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) -{ - char *rn = NULL; - struct rcu_head *next, *list; - unsigned long flags; - RCU_TRACE(int cb_count = 0); - - /* If no RCU callbacks ready to invoke, just return. 
*/ - if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - ACCESS_ONCE(rcp->rcucblist), - need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread())); - return; - } - - /* Move the ready-to-invoke callbacks to a local list. */ - local_irq_save(flags); - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); - list = rcp->rcucblist; - rcp->rcucblist = *rcp->donetail; - *rcp->donetail = NULL; - if (rcp->curtail == rcp->donetail) - rcp->curtail = &rcp->rcucblist; - rcu_preempt_remove_callbacks(rcp); - rcp->donetail = &rcp->rcucblist; - local_irq_restore(flags); - - /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name); - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - local_bh_disable(); - __rcu_reclaim(rn, list); - local_bh_enable(); - list = next; - RCU_TRACE(cb_count++); - } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread())); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); -} - -/* - * Wait for a grace period to elapse. But it is illegal to invoke - * synchronize_sched() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_sched() is a quiescent - * state, and so on a UP system, synchronize_sched() need do nothing. - * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the - * benefits of doing might_sleep() to reduce latency.) - * - * Cool, huh? (Due to Josh Triplett.) - * - * But we want to make this a static inline later. The cond_resched() - * currently makes this problematic. 
- */ -void synchronize_sched(void) -{ - cond_resched(); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/* - * Helper function for call_rcu() and call_rcu_bh(). - */ -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - local_irq_save(flags); - *rcp->curtail = head; - rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); - local_irq_restore(flags); -} - -/* - * Post an RCU callback to be invoked after the end of an RCU-sched grace - * period. But since we have but one CPU, that would be after any - * quiescent state. - */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Post an RCU bottom-half callback to be invoked after any subsequent - * quiescent state. - */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_bh_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. 
McKenney - * Josh Triplett - * - * See also: Documentation/RCU/torture.txt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and " - "Josh Triplett "); - -static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ -static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval; /* Interval between stats, in seconds. */ - /* Defaults to "only at end of test". */ -static bool verbose; /* Print more debug info. */ -static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ -static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ -static int stutter = 5; /* Start/stop testing interval (in sec) */ -static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff; /* Hold time within burst (us). */ -static int fqs_stutter = 3; /* Wait time between bursts (s). */ -static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ -static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ -static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ -static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ -static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ -static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ - -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { 
printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static char printk_buf[4096]; - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; -static struct task_struct *fqs_task; -static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture __rcu *rcu_torture_current; -static unsigned long rcu_torture_current_version; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; -static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_failure; -static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; -static long n_offline_attempts; -static long 
n_offline_successes; -static long n_online_attempts; -static long n_online_successes; -static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; - -static int stutter_pause_test; - -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); - -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ - -static unsigned long shutdown_time; /* jiffies to system shutdown. */ -static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ - /* and boost task create/destroy. */ - -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - printk(KERN_WARNING /* but going down anyway, so... 
*/ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - printk(KERN_NOTICE - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - -/* - * Allocate an element from the rcu_tortures pool. - */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. - */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). 
- */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_torture_ops { - void (*init)(void); - void (*cleanup)(void); - int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferred_free)(struct rcu_torture *p); - void (*sync)(void); - void (*cb_barrier)(void); - void (*fqs)(void); - int (*stats)(char *page); - int irq_capable; - int can_boost; - char *name; -}; - -static struct rcu_torture_ops *cur_ops; - -/* - * Definitions for rcu torture testing. - */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; - - /* We want a short delay sometimes to make a reader delay the grace - * period, and we want a long delay occasionally to trigger - * force_quiescent_state. 
*/ - - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) - udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop != FULLSTOP_DONTSTOP) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else - cur_ops->deferred_free(rp); -} - -static int rcu_no_completed(void) -{ - return 0; -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -static void rcu_sync_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - cur_ops->sync(); - list_add(&p->rtort_free, &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, 
rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = NULL, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu_sync" -}; - -static struct rcu_torture_ops rcu_expedited_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_expedited, - .cb_barrier = NULL, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu_expedited" -}; - -/* - * Definitions for rcu_bh torture testing. 
- */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh" -}; - -static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_sync" -}; - -static struct rcu_torture_ops rcu_bh_expedited_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh_expedited, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_expedited" -}; - -/* - * Definitions for srcu torture testing. 
- */ - -static struct srcu_struct srcu_ctl; - -static void srcu_torture_init(void) -{ - init_srcu_struct(&srcu_ctl); - rcu_sync_torture_init(); -} - -static void srcu_torture_cleanup(void) -{ - synchronize_srcu(&srcu_ctl); - cleanup_srcu_struct(&srcu_ctl); -} - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. */ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static int srcu_torture_stats(char *page) -{ - int cnt = 0; - int cpu; - int idx = srcu_ctl.completed & 0x1; - - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - cnt += sprintf(&page[cnt], "\n"); - return cnt; -} - -static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ - return 
srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw" -}; - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_expedited_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize_expedited, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_expedited" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched" -}; - -static struct rcu_torture_ops sched_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .name = "sched_sync" -}; - -static struct rcu_torture_ops sched_expedited_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched_expedited, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched_expedited" -}; - -/* - * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. 
- */ - -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; -}; - -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); - - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; -} - -static int rcu_torture_boost(void *arg) -{ - unsigned long call_rcu_time; - unsigned long endtime; - unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; - - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); - - /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } - - init_rcu_head_on_stack(&rbi.rcu); - /* Each pass through the following loop does one boost-test cycle. */ - do { - /* Wait for the next test interval. */ - oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_uninterruptible(1); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* Do one boost-test interval. */ - endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; - call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } - call_rcu_time = jiffies; - } - cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* - * Set the start time of the next test interval. 
- * Yes, this is vulnerable to long delays, but such - * delays simply cause a false negative for the next - * interval. Besides, we are running at RT priority, - * so delays should be relatively rare. - */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } - - /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) - schedule_timeout_uninterruptible(1); - smp_mb(); /* order accesses to ->inflight before stack-frame death. */ - destroy_rcu_head_on_stack(&rbi.rcu); - return 0; -} - -/* - * RCU torture force-quiescent-state kthread. Repeatedly induces - * bursts of calls to force_quiescent_state(), increasing the probability - * of occurrence of some important types of race conditions. 
- */ -static int -rcu_torture_fqs(void *arg) -{ - unsigned long fqs_resume_time; - int fqs_burst_remaining; - - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); - do { - fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && - !kthread_should_stop()) { - schedule_timeout_interruptible(1); - } - fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0 && - !kthread_should_stop()) { - cur_ops->fqs(); - udelay(fqs_holdoff); - fqs_burst_remaining -= fqs_holdoff; - } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). 
- */ -static int -rcu_torture_writer(void *arg) -{ - int i; - long oldbatch = rcu_batches_completed(); - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - rp = rcu_torture_alloc(); - if (rp == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_dereference_check(rcu_torture_current, - current == writer_task); - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - cur_ops->deferred_free(old_rp); - } - rcutorture_record_progress(++rcu_torture_current_version); - oldbatch = cur_ops->completed(); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. 
- */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); - ftrace_dump(DUMP_ALL); -} - -/* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. 
- */ -static void rcu_torture_timer(unsigned long unused) -{ - int idx; - int completed; - static DEFINE_RCU_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); - struct rcu_torture *p; - int pipe_count; - - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - if (pipe_count > 1) - rcutorture_trace_dump(); - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. 
- */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - struct timer_list t; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); - - do { - if (irqreader && cur_ops->irq_capable) { - if (!timer_pending(&t)) - mod_timer(&t, jiffies + 1); - } - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - if (pipe_count > 1) - rcutorture_trace_dump(); - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irq_capable) - del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. 
- */ -static int -rcu_torture_printk(char *page) -{ - int cnt = 0; - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror, - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers, - n_online_successes, - n_online_attempts, - n_offline_successes, - n_offline_attempts); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0) - cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (i > 1) { - cnt += sprintf(&page[cnt], "!!! 
"); - atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); - } - cnt += sprintf(&page[cnt], "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", - atomic_read(&rcu_torture_wcount[i])); - } - cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int cnt; - - cnt = rcu_torture_printk(printk_buf); - printk(KERN_ALERT "%s", printk_buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. 
- */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. 
- */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) -{ - printk(KERN_ALERT "%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval); -} - -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - -static void rcutorture_booster_cleanup(int cpu) -{ - struct task_struct *t; - - if (boost_tasks[cpu] == NULL) - return; - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); - t = 
boost_tasks[cpu]; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - - /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); -} - -static int rcutorture_booster_init(int cpu) -{ - int retval; - - if (boost_tasks[cpu] != NULL) - return 0; /* Already created, nothing more to do. */ - - /* Don't allow time recalculation while creating a new task. */ - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); - if (IS_ERR(boost_tasks[cpu])) { - retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); - n_rcu_torture_boost_ktrerror++; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - return retval; - } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); - mutex_unlock(&boost_mutex); - return 0; -} - -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. 
*/ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int __cpuinit -rcu_torture_onoff(void *arg) -{ - int cpu; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - n_offline_attempts++; - if (cpu_down(cpu) == 0) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", - torture_type, cpu); - n_offline_successes++; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - n_online_attempts++; - if (cpu_up(cpu) == 0) { - if (verbose) - printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", - torture_type, cpu); - n_online_successes++; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int __cpuinit -rcu_torture_onoff_init(void) -{ - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - onoff_task = NULL; - return PTR_ERR(onoff_task); - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void -rcu_torture_onoff_init(void) -{ -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif 
/* #else #ifdef CONFIG_HOTPLUG_CPU */ - -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; - -static void -rcu_torture_cleanup(void) -{ - int i; - - mutex_lock(&fullstop_mutex); - rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - printk(KERN_WARNING /* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - return; - } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - 
kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - rcu_torture_onoff_cleanup(); - - /* Wait for all RCU callbacks to fire. */ - - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (cur_ops->cleanup) - cur_ops->cleanup(); - if (atomic_read(&n_rcu_torture_error)) - rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else - rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, - &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; - - mutex_lock(&fullstop_mutex); - - /* Process args and tell the world that the torturer is on the job. 
*/ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - printk(KERN_ALERT "rcu-torture types:"); - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - printk(KERN_ALERT " %s", torture_ops[i]->name); - printk(KERN_ALERT "\n"); - mutex_unlock(&fullstop_mutex); - return -EINVAL; - } - if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " - "fqs_duration, fqs disabled.\n"); - fqs_duration = 0; - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; - - /* Set up the freelist. */ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_failure = 0; - n_rcu_torture_boosts = 0; - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. 
*/ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_run(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); - goto unwind; - 
} - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - if (stutter < 0) - stutter = 0; - if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - goto unwind; - } - } - if (fqs_duration < 0) - fqs_duration = 0; - if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; - goto unwind; - } - } - if (test_boost_interval < 1) - test_boost_interval = 1; - if (test_boost_duration < 2) - test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - int retval; - - boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. 
*/ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; - goto unwind; - } - } - } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_run(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - } - rcu_torture_onoff_init(); - register_reboot_notifier(&rcutorture_shutdown_nb); - rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); - return 0; - -unwind: - mutex_unlock(&fullstop_mutex); - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * Paul E. McKenney Hierarchical version - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
- * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcutree.h" -#include - -#include "rcu.h" - -/* Data structures. */ - -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; - -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. */ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ - .fqs_state = RCU_GP_IDLE, \ - .gpnum = -300, \ - .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ - .name = #structname, \ -} - -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); -DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); - -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); - -static struct rcu_state *rcu_state; - -/* - * The rcu_scheduler_active variable transitions from zero to one just - * before the first task is spawned. So when this variable is zero, RCU - * can assume that there is but one task, allowing RCU to (for example) - * optimized synchronize_sched() to a simple barrier(). When this variable - * is one, RCU must actually do all the hard work required to detect real - * grace periods. This variable is also used to suppress boot-time false - * positives from lockdep-RCU error checking. 
- */ -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * The rcu_scheduler_fully_active variable transitions from zero to one - * during the early_initcall() processing, which is after the scheduler - * is capable of creating new tasks. So RCU processing (for example, - * creating tasks for RCU priority boosting) must be delayed until after - * rcu_scheduler_fully_active transitions from zero to one. We also - * currently delay invocation of any RCU callbacks until after this point. - * - * It might later prove better for people registering RCU callbacks during - * early boot to take responsibility for these callbacks, but one step at - * a time. - */ -static int rcu_scheduler_fully_active __read_mostly; - -#ifdef CONFIG_RCU_BOOST - -/* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); -static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); - -/* - * Track the rcutorture test sequence number and the update version - * number within a given test. The rcutorture_testseq is incremented - * on every rcutorture module load and unload, so has an odd value - * when a test is running. The rcutorture_vernum is set to zero - * when rcutorture starts and is incremented on each rcutorture update. - * These variables enable correlating rcutorture output with the - * RCU tracing information. - */ -unsigned long rcutorture_testseq; -unsigned long rcutorture_vernum; - -/* - * Return true if an RCU grace period is in progress. 
The ACCESS_ONCE()s - * permit this function to be invoked without holding the root rcu_node - * structure's ->lock, but of course results can be subject to change. - */ -static int rcu_gp_in_progress(struct rcu_state *rsp) -{ - return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); -} - -/* - * Note a quiescent state. Because we do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period, this just sets a flag. - * The caller must have disabled preemption. - */ -void rcu_sched_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); - rdp->passed_quiesce = 1; -} - -void rcu_bh_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); - rdp->passed_quiesce = 1; -} - -/* - * Note a context switch. This is a quiescent state for RCU-sched, - * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. - */ -void rcu_note_context_switch(int cpu) -{ - trace_rcu_utilization("Start context switch"); - rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); - trace_rcu_utilization("End context switch"); -} -EXPORT_SYMBOL_GPL(rcu_note_context_switch); - -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_NESTING, - .dynticks = ATOMIC_INIT(1), -}; - -static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static int qhimark = 10000; /* If this many pending, ignore blimit. */ -static int qlowmark = 100; /* Once only this many pending, use blimit. 
*/ - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); - -int rcu_cpu_stall_suppress __read_mostly; -module_param(rcu_cpu_stall_suppress, int, 0644); - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed); -static int rcu_pending(int cpu); - -/* - * Return the number of RCU-sched batches processed thus far for debug & stats. - */ -long rcu_batches_completed_sched(void) -{ - return rcu_sched_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); - -/* - * Return the number of RCU BH batches processed thus far for debug & stats. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* - * Force a quiescent state for RCU BH. - */ -void rcu_bh_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_bh_state, 0); -} -EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); - -/* - * Record the number of times rcutorture tests have been initiated and - * terminated. This information allows the debugfs tracing stats to be - * correlated to the rcutorture messages, even when the rcutorture module - * is being repeatedly loaded and unloaded. In other words, we cannot - * store this state in rcutorture itself. - */ -void rcutorture_record_test_transition(void) -{ - rcutorture_testseq++; - rcutorture_vernum = 0; -} -EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); - -/* - * Record the number of writer passes through the current rcutorture test. - * This is also used to correlate debugfs tracing stats with the rcutorture - * messages. - */ -void rcutorture_record_progress(unsigned long vernum) -{ - rcutorture_vernum++; -} -EXPORT_SYMBOL_GPL(rcutorture_record_progress); - -/* - * Force a quiescent state for RCU-sched. 
- */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state, 0); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - -/* - * Does the CPU have callbacks ready to be invoked? - */ -static int -cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) -{ - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; -} - -/* - * Does the current CPU require a yet-as-unscheduled grace period? - */ -static int -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); -} - -/* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ - return &rsp->node[0]; -} - -#ifdef CONFIG_SMP - -/* - * If the specified CPU is offline, tell the caller that it is in - * a quiescent state. Otherwise, whack it with a reschedule IPI. - * Grace periods can end up waiting on an offline CPU when that - * CPU is in the process of coming online -- it will be added to the - * rcu_node bitmasks before it actually makes it online. The same thing - * can happen while a CPU is in the process of coming online. Because this - * race is quite rare, we check for it after detecting that the grace - * period has been delayed rather than checking each and every CPU - * each and every time we start a new grace period. - */ -static int rcu_implicit_offline_qs(struct rcu_data *rdp) -{ - /* - * If the CPU is offline, it is in a quiescent state. We can - * trust its state not to change because interrupts are disabled. - */ - if (cpu_is_offline(rdp->cpu)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); - rdp->offline_fqs++; - return 1; - } - - /* - * The CPU is online, so send it a reschedule IPI. This forces - * it through the scheduler, and (inefficiently) also handles cases - * where idle loops fail to inform RCU about the CPU being idle. 
- */ - if (rdp->cpu != smp_processor_id()) - smp_send_reschedule(rdp->cpu); - else - set_need_resched(); - rdp->resched_ipi++; - return 0; -} - -#endif /* #ifdef CONFIG_SMP */ - -/* - * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle - * - * If the new value of the ->dynticks_nesting counter now is zero, - * we really have entered idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) -{ - trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_prepare_for_idle(smp_processor_id()); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); -} - -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. 
- */ -void rcu_idle_enter(void) -{ - unsigned long flags; - long long oldval; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting = 0; - rcu_idle_enter_common(rdtp, oldval); - local_irq_restore(flags); -} - -/** - * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle - * - * Exit from an interrupt handler, which might possibly result in entering - * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. - * - * This code assumes that the idle loop never does anything that might - * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - */ -void rcu_irq_exit(void) -{ - unsigned long flags; - long long oldval; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); - else - rcu_idle_enter_common(rdtp, oldval); - local_irq_restore(flags); -} - -/* - * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) -{ - smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. 
*/ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); - trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - trace_rcu_dyntick("Error on exit: not idle task", - oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - long long oldval; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; - rcu_idle_exit_common(rdtp, oldval); - local_irq_restore(flags); -} - -/** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. 
If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - */ -void rcu_irq_enter(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - long long oldval; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); - else - rcu_idle_exit_common(rdtp, oldval); - local_irq_restore(flags); -} - -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is active. - */ -void rcu_nmi_enter(void) -{ - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) - return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic_inc(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); -} - -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is no longer active. 
- */ -void rcu_nmi_exit(void) -{ - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) - return; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); -} - -#ifdef CONFIG_PROVE_RCU - -/** - * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle - * - * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. - */ -int rcu_is_cpu_idle(void) -{ - int ret; - - preempt_disable(); - ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(rcu_is_cpu_idle); - -#endif /* #ifdef CONFIG_PROVE_RCU */ - -/** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle - * - * If the current CPU is idle or running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. - */ -int rcu_is_cpu_rrupt_from_idle(void) -{ - return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; -} - -#ifdef CONFIG_SMP - -/* - * Snapshot the specified CPU's dynticks counter so that we can later - * credit them with an implicit quiescent state. Return 1 if this CPU - * is in dynticks idle mode, which is an extended quiescent state. - */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) -{ - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - return (rdp->dynticks_snap & 0x1) == 0; -} - -/* - * Return true if the specified CPU has passed through a quiescent - * state by virtue of being in or having passed through an dynticks - * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU. 
- */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) -{ - unsigned int curr; - unsigned int snap; - - curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned int)rdp->dynticks_snap; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq/NMI handlers, then we can safely pretend that the CPU - * already acknowledged the request to pass through a quiescent - * state. Either way, that CPU cannot possibly be in an RCU - * read-side critical section that started before the beginning - * of the current RCU grace period. - */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); - rdp->dynticks_fqs++; - return 1; - } - - /* Go check for the CPU being offline. */ - return rcu_implicit_offline_qs(rdp); -} - -#endif /* #ifdef CONFIG_SMP */ - -static void record_gp_stall_check_time(struct rcu_state *rsp) -{ - rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; -} - -static void print_other_cpu_stall(struct rcu_state *rsp) -{ - int cpu; - long delta; - unsigned long flags; - int ndetected; - struct rcu_node *rnp = rcu_get_root(rsp); - - /* Only let one CPU complain about others per time interval. */ - - raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - ndetected = rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - /* - * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. 
- */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", - rsp->name); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rnp->qsmask == 0) - continue; - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - printk(" %d", rnp->grplo + cpu); - ndetected++; - } - } - printk("} (detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start)); - if (ndetected == 0) - printk(KERN_ERR "INFO: Stall ended before state dump start\n"); - else if (!trigger_all_cpu_backtrace()) - dump_stack(); - - /* If so configured, complain about tasks blocking the grace period. */ - - rcu_print_detail_task_stall(rsp); - - force_quiescent_state(rsp, 0); /* Kick them all. */ -} - -static void print_cpu_stall(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", - rsp->name, smp_processor_id(), jiffies - rsp->gp_start); - if (!trigger_all_cpu_backtrace()) - dump_stack(); - - raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long j; - unsigned long js; - struct rcu_node *rnp; - - if (rcu_cpu_stall_suppress) - return; - j = ACCESS_ONCE(jiffies); - js = ACCESS_ONCE(rsp->jiffies_stall); - rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { - - /* We haven't checked in, so go dump stack. 
*/ - print_cpu_stall(rsp); - - } else if (rcu_gp_in_progress(rsp) && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); - } -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static void __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -} - -/* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. - */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - if (rdp->gpnum != rnp->gpnum) { - /* - * If the current grace period is waiting for this CPU, - * set up to detect a quiescent state, otherwise don't - * go looking for one. 
- */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - if (rnp->qsmask & rdp->grpmask) { - rdp->qs_pending = 1; - rdp->passed_quiesce = 0; - } else - rdp->qs_pending = 0; - } -} - -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __note_new_gpnum(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. In addition, the corresponding leaf rcu_node structure's - * ->lock must be held by the caller, with irqs disabled. - */ -static void -__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Did another grace period end? */ - if (rdp->completed != rnp->completed) { - - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - - /* Remember that we saw this grace-period completion. 
*/ - rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); - - /* - * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs handled on - * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. If another grace period - * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. - */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) - rdp->gpnum = rdp->completed; - - /* - * If RCU does not need a quiescent state from this CPU, - * then make sure that this CPU doesn't go looking for one. - */ - if ((rnp->qsmask & rdp->grpmask) == 0) - rdp->qs_pending = 0; - } -} - -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __rcu_process_gp_end(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Do per-CPU grace-period initialization for running CPU. The caller - * must hold the lock of the leaf rcu_node structure corresponding to - * this CPU. - */ -static void -rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Prior grace period ended, so advance callbacks for current CPU. */ - __rcu_process_gp_end(rsp, rnp, rdp); - - /* - * Because this CPU just now started the new grace period, we know - * that all of its callbacks will be covered by this upcoming grace - * period, even the ones that were registered arbitrarily recently. - * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. 
- * - * Other CPUs cannot be sure exactly when the grace period started. - * Therefore, their recently registered callbacks must pass through - * an additional RCU_NEXT_READY stage, so that they will be handled - * by the next RCU grace period. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - - /* Set state so that this CPU will detect the next quiescent state. */ - __note_new_gpnum(rsp, rnp, rdp); -} - -/* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. - */ -static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - if (!rcu_scheduler_fully_active || - !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either the scheduler hasn't yet spawned the first - * non-idle task or this CPU does not need another - * grace period. Either way, don't start a new grace - * period. - */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - if (rsp->fqs_active) { - /* - * This CPU needs a grace period, but force_quiescent_state() - * is running. Tell it to start one on this CPU's behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Advance to a new grace period and initialize state. */ - rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - record_gp_stall_check_time(rsp); - - /* Special-case the common single-level case. 
*/ - if (NUM_RCU_NODES == 1) { - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ - - - /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - - /* - * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. This - * operation relies on the layout of the hierarchy within the - * rsp->node[] array. Note that other CPUs will access only - * the leaves of the hierarchy, which still indicate that no - * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. - * - * Note that the grace period cannot complete until we finish - * the initialization process, as there will be at least one - * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU, due to the fact that we have - * irqs disabled. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ - } - - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); -} - -/* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. - */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) -{ - unsigned long gp_duration; - struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - - /* - * Ensure that all grace-period and pre-grace-period activity - * is seen before the assignment to rsp->completed. - */ - smp_mb(); /* See above block comment. */ - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; - - /* - * We know the grace period is complete, but to everyone else - * it appears to still be ongoing. But it is also the case - * that to everyone else it looks like there is nothing that - * they can do to advance the grace period. It is therefore - * safe for us to drop the lock in order to mark the grace - * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. - */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ - - /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->gpnum; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - } - - rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ -} - -/* - * Similar to rcu_report_qs_rdp(), for which it is a helper function. - * Allows quiescent states for a group of CPUs to be reported at one go - * to the specified rcu_node structure, though all the CPUs in the group - * must be represented by the same rcu_node structure (which need not be - * a leaf rcu_node structure, though it often will be). That structure's - * lock must be held upon entry, and it is released before return. - */ -static void -rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - struct rcu_node *rnp_c; - - /* Walk up the rcu_node hierarchy. */ - for (;;) { - if (!(rnp->qsmask & mask)) { - - /* Our bit has already been cleared, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, - mask, rnp->qsmask, rnp->level, - rnp->grplo, rnp->grphi, - !!rnp->gp_tasks); - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - - /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - mask = rnp->grpmask; - if (rnp->parent == NULL) { - - /* No more levels. Exit loop holding root lock. 
*/ - - break; - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rnp_c = rnp; - rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - WARN_ON_ONCE(rnp_c->qsmask); - } - - /* - * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Invoke rcu_report_qs_rsp() - * to clean up and start the next grace period if one is needed. - */ - rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ -} - -/* - * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! - */ -static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp; - - rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { - - /* - * The grace period in which this quiescent state was - * recorded has ended, so don't report it upwards. - * We will instead need a new quiescent state that lies - * within the current grace period. - */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - mask = rdp->grpmask; - if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else { - rdp->qs_pending = 0; - - /* - * This GP can't end until cpu checks in, so all of our - * callbacks can be processed during the next GP. 
- */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - - rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ - } -} - -/* - * Check to see if there is a new grace period of which this CPU - * is not yet aware, and if so, set up local rcu_data state for it. - * Otherwise, see if this CPU has just passed through its first - * quiescent state for this grace period, and record that fact if so. - */ -static void -rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) -{ - /* If there is now a new grace period, record and return. */ - if (check_for_new_grace_period(rsp, rdp)) - return; - - /* - * Does this CPU still need to do its part for current grace period? - * If no, return and let the other CPUs do their part as well. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesce) - return; - - /* - * Tell RCU we are done (but rcu_report_qs_rdp() will be the - * judge of that). - */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. - */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) -{ - int i; - /* current DYING CPU is cleared in the cpu_online_mask */ - int receive_cpu = cpumask_any(cpu_online_mask); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); - - if (rdp->nxtlist == NULL) - return; /* irqs disabled, so comparison is stable. 
*/ - - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen += rdp->qlen; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen = 0; -} - -/* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. - */ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - int need_report = 0; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp; - - rcu_stop_cpu_kthread(cpu); - - /* Exclude any attempts to start a new grace period. */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - else - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); - break; - } - if (rnp == rdp->mynode) { - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - } else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. 
The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - rcu_node_kthread_setaffinity(rnp, -1); -} - -/* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU. This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. - */ -static void rcu_offline_cpu(int cpu) -{ - __rcu_offline_cpu(cpu, &rcu_sched_state); - __rcu_offline_cpu(cpu, &rcu_bh_state); - rcu_preempt_offline_cpu(cpu); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_send_cbs_to_online(struct rcu_state *rsp) -{ -} - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * Invoke any RCU callbacks that have made it to the end of their grace - * period. Thottle as specified by rdp->blimit. - */ -static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_head *next, *list, **tail; - int bl, count; - - /* If no callbacks are ready, just return.*/ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, 0, 0); - trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), - need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); - return; - } - - /* - * Extract the list of ready callbacks, disabling to prevent - * races with call_rcu() from interrupt handlers. 
- */ - local_irq_save(flags); - bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen, bl); - list = rdp->nxtlist; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - tail = rdp->nxttail[RCU_DONE_TAIL]; - for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) - if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[count] = &rdp->nxtlist; - local_irq_restore(flags); - - /* Invoke callbacks. */ - count = 0; - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - __rcu_reclaim(rsp->name, list); - list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) - break; - } - - local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread()); - - /* Update count, and requeue any remaining callbacks. */ - rdp->qlen -= count; - rdp->n_cbs_invoked += count; - if (list != NULL) { - *tail = rdp->nxtlist; - rdp->nxtlist = list; - for (count = 0; count < RCU_NEXT_SIZE; count++) - if (&rdp->nxtlist == rdp->nxttail[count]) - rdp->nxttail[count] = tail; - else - break; - } - - /* Reinstate batch limit if we have worked down the excess. */ - if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ - if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { - rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; - } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) - rdp->qlen_last_fqs_check = rdp->qlen; - - local_irq_restore(flags); - - /* Re-invoke RCU core processing if there are callbacks remaining. 
*/ - if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_core(); -} - -/* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). - */ -void rcu_check_callbacks(int cpu, int user) -{ - trace_rcu_utilization("Start scheduler-tick"); - if (user || rcu_is_cpu_rrupt_from_idle()) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. - * - * No memory barrier is required here because both - * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local - * variables that other CPUs neither access nor modify, - * at least not while the corresponding CPU is online. - */ - - rcu_sched_qs(cpu); - rcu_bh_qs(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so note it. - */ - - rcu_bh_qs(cpu); - } - rcu_preempt_check_callbacks(cpu); - if (rcu_pending(cpu)) - invoke_rcu_core(); - trace_rcu_utilization("End scheduler-tick"); -} - -#ifdef CONFIG_SMP - -/* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. 
- */ -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) -{ - unsigned long bit; - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rsp, rnp) { - mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - if (!rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - continue; - } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu))) - mask |= bit; - } - if (mask != 0) { - - /* rcu_report_qs_rnp() releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, flags); - continue; - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } -} - -/* - * Force quiescent states on reluctant CPUs, and also detect which - * CPUs are in dyntick-idle mode. - */ -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ - unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); - - trace_rcu_utilization("Start fqs"); - if (!rcu_gp_in_progress(rsp)) { - trace_rcu_utilization("End fqs"); - return; /* No grace period in progress, nothing to force. */ - } - if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { - rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - trace_rcu_utilization("End fqs"); - return; /* Someone else is already on the job. */ - } - if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) - goto unlock_fqs_ret; /* no emergency and done recently. 
*/ - rsp->n_force_qs++; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if(!rcu_gp_in_progress(rsp)) { - rsp->n_force_qs_ngp++; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - goto unlock_fqs_ret; /* no GP in progress, time updated. */ - } - rsp->fqs_active = 1; - switch (rsp->fqs_state) { - case RCU_GP_IDLE: - case RCU_GP_INIT: - - break; /* grace period idle or initializing, ignore. */ - - case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. */ - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - - /* Record dyntick-idle state. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - if (rcu_gp_in_progress(rsp)) - rsp->fqs_state = RCU_FORCE_QS; - break; - - case RCU_FORCE_QS: - - /* Check dyntick-idle state, send IPI to laggarts. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); - - /* Leave state in case more forcing is required. */ - - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - break; - } - rsp->fqs_active = 0; - if (rsp->fqs_need_gp) { - raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ - rsp->fqs_need_gp = 0; - rcu_start_gp(rsp, flags); /* releases rnp->lock */ - trace_rcu_utilization("End fqs"); - return; - } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -unlock_fqs_ret: - raw_spin_unlock_irqrestore(&rsp->fqslock, flags); - trace_rcu_utilization("End fqs"); -} - -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ - set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ - -/* - * This does the RCU core processing work for the specified rcu_state - * and rcu_data structures. This may be called only from the CPU to - * whom the rdp belongs. 
- */ -static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - - WARN_ON_ONCE(rdp->beenonline == 0); - - /* - * If an RCU GP has gone long enough, go check for dyntick - * idle CPUs and, if needed, send resched IPIs. - */ - if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); - - /* - * Advance callbacks in response to end of earlier grace - * period that some other CPU ended. - */ - rcu_process_gp_end(rsp, rdp); - - /* Update RCU state based on any recent quiescent states. */ - rcu_check_quiescent_state(rsp, rdp); - - /* Does this CPU require a not-yet-started grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); - rcu_start_gp(rsp, flags); /* releases above lock */ - } - - /* If there are callbacks ready, invoke them. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_callbacks(rsp, rdp); -} - -/* - * Do RCU core processing for the current CPU. - */ -static void rcu_process_callbacks(struct softirq_action *unused) -{ - trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); - rcu_preempt_process_callbacks(); - trace_rcu_utilization("End RCU core"); -} - -/* - * Schedule RCU callback invocation. If the specified type of RCU - * does not support RCU priority boosting, just do a direct call, - * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with interrupts disabled, the - * rcu_cpu_kthread_task cannot disappear out from under us. 
- */ -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) -{ - if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) - return; - if (likely(!rsp->boost)) { - rcu_do_batch(rsp, rdp); - return; - } - invoke_rcu_callbacks_kthread(); -} - -static void invoke_rcu_core(void) -{ - raise_softirq(RCU_SOFTIRQ); -} - -static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - smp_mb(); /* Ensure RCU update seen before callback registry. */ - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ - local_irq_save(flags); - rdp = this_cpu_ptr(rsp->rda); - - /* Add the callback to our list. */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - rdp->qlen++; - - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen); - else - trace_rcu_callback(rsp->name, head, rdp->qlen); - - /* If interrupts were disabled, don't dive into RCU core. */ - if (irqs_disabled_flags(flags)) { - local_irq_restore(flags); - return; - } - - /* - * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() - * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback - * is the only one waiting for a grace period to complete. - */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - - /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); - - /* Start a new grace period if one not already started. 
*/ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ - } else { - /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; - } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); - local_irq_restore(flags); -} - -/* - * Queue an RCU-sched callback for invocation after a grace period. - */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_state); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Queue an RCU for invocation after a quicker grace period. - */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_bh_state); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. 
However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. - */ -void synchronize_sched(void) -{ - if (rcu_blocking_is_gp()) - return; - wait_rcu_gp(call_rcu_sched); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - */ -void synchronize_rcu_bh(void) -{ - if (rcu_blocking_is_gp()) - return; - wait_rcu_gp(call_rcu_bh); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, for the specified type of RCU, returning 1 if so. - * The checks are in order of increasing expense: checks that can be - * carried out against CPU-local state are performed first. However, - * we must check for CPU stalls first, else we might not get a chance. - */ -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) -{ - struct rcu_node *rnp = rdp->mynode; - - rdp->n_rcu_pending++; - - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rsp, rdp); - - /* Is the RCU core waiting for a quiescent state from this CPU? 
*/ - if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { - - /* - * If force_quiescent_state() coming soon and this CPU - * needs a quiescent state, and this is either RCU-sched - * or RCU-bh, force a local reschedule. - */ - rdp->n_rp_qs_pending++; - if (!rdp->preemptible && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, - jiffies)) - set_need_resched(); - } else if (rdp->qs_pending && rdp->passed_quiesce) { - rdp->n_rp_report_qs++; - return 1; - } - - /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) { - rdp->n_rp_cb_ready++; - return 1; - } - - /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; - return 1; - } - - /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; - return 1; - } - - /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ - rdp->n_rp_gp_started++; - return 1; - } - - /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (rcu_gp_in_progress(rsp) && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { - rdp->n_rp_need_fqs++; - return 1; - } - - /* nothing to do */ - rdp->n_rp_need_nothing++; - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -static int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. 
- */ -static int rcu_cpu_has_callbacks(int cpu) -{ - /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_needs_cpu(cpu); -} - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - -static void rcu_barrier_callback(struct rcu_head *notused) -{ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. - */ -static void rcu_barrier_func(void *type) -{ - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - - atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) -{ - BUG_ON(in_interrupt()); - /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); - /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. Note that on_each_cpu() disables irqs, which prevents - * any CPUs from coming online or going offline until each online - * CPU has queued its RCU-barrier callback. 
- */ - atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); - wait_for_completion(&rcu_barrier_completion); - mutex_unlock(&rcu_barrier_mutex); -} - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(&rcu_bh_state, call_rcu_bh); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(&rcu_sched_state, call_rcu_sched); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -/* - * Do boot-time initialization of a CPU's per-CPU RCU data. - */ -static void __init -rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - int i; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); - - /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen = 0; - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); - rdp->cpu = cpu; - rdp->rsp = rsp; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Initialize a CPU's per-CPU RCU data. Note that only one online or - * offline event can be happening at a given time. Note also that we - * can accept some slop in the rsp->completed access due to the fact - * that this CPU cannot possibly have any RCU callbacks in flight yet. 
- */ -static void __cpuinit -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); - - /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptible = preemptible; - rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->blimit = blimit; - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - - /* - * A new grace period might start here. If so, we won't be part - * of it, but that is OK, as we are currently in a quiescent state. - */ - - /* Exclude any attempts to start a new GP on large systems. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - - /* Add CPU to rcu_node bitmasks. */ - rnp = rdp->mynode; - mask = rdp->grpmask; - do { - /* Exclude any attempts to start a new GP on small systems. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit |= mask; - mask = rnp->grpmask; - if (rnp == rdp->mynode) { - /* - * If there is a grace period in progress, we will - * set up to wait for it next time we run the - * RCU core code. - */ - rdp->gpnum = rnp->completed; - rdp->completed = rnp->completed; - rdp->passed_quiesce = 0; - rdp->qs_pending = 0; - rdp->passed_quiesce_gpnum = rnp->gpnum - 1; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); - } - raw_spin_unlock(&rnp->lock); /* irqs already disabled. 
*/ - rnp = rnp->parent; - } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); -} - -static void __cpuinit rcu_prepare_cpu(int cpu) -{ - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); -} - -/* - * Handle CPU online/offline notification events. - */ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - - trace_rcu_utilization("Start CPU hotplug"); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_prepare_cpu(cpu); - rcu_prepare_kthreads(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); - break; - case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - /* - * The whole machine is "stopped" except this CPU, so we can - * touch any data without introducing corruption. We send the - * dying CPU's callbacks to an arbitrarily chosen online CPU. - */ - rcu_send_cbs_to_online(&rcu_bh_state); - rcu_send_cbs_to_online(&rcu_sched_state); - rcu_preempt_send_cbs_to_online(); - rcu_cleanup_after_idle(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - trace_rcu_utilization("End CPU hotplug"); - return NOTIFY_OK; -} - -/* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). 
After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. This function also enables RCU lockdep checking. - */ -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} - -/* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. - */ -#ifdef CONFIG_RCU_FANOUT_EXACT -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int i; - - for (i = NUM_RCU_LVLS - 1; i > 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = RCU_FANOUT_LEAF; -} -#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int ccur; - int cprv; - int i; - - cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } -} -#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ - -/* - * Helper function for rcu_init() that initializes one rcu_state structure. - */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) -{ - static char *buf[] = { "rcu_node_level_0", - "rcu_node_level_1", - "rcu_node_level_2", - "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ - int cpustride = 1; - int i; - int j; - struct rcu_node *rnp; - - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ - - /* Initialize the level-tracking arrays. */ - - for (i = 1; i < NUM_RCU_LVLS; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); - - /* Initialize the elements themselves, starting from the leaves. 
*/ - - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; - rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, - &rcu_node_class[i], buf[i]); - rnp->gpnum = 0; - rnp->qsmask = 0; - rnp->qsmaskinit = 0; - rnp->grplo = j * cpustride; - rnp->grphi = (j + 1) * cpustride - 1; - if (rnp->grphi >= NR_CPUS) - rnp->grphi = NR_CPUS - 1; - if (i == 0) { - rnp->grpnum = 0; - rnp->grpmask = 0; - rnp->parent = NULL; - } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; - rnp->grpmask = 1UL << rnp->grpnum; - rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; - } - rnp->level = i; - INIT_LIST_HEAD(&rnp->blkd_tasks); - } - } - - rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; - for_each_possible_cpu(i) { - while (i > rnp->grphi) - rnp++; - per_cpu_ptr(rsp->rda, i)->mynode = rnp; - rcu_boot_init_percpu_data(i, rsp); - } -} - -void __init rcu_init(void) -{ - int cpu; - - rcu_bootup_announce(); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - cpu_notifier(rcu_cpu_notify, 0); - for_each_online_cpu(cpu) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - check_cpu_stall_init(); -} - -#include "rcutree_plugin.h" -/* - * Read-Copy Update tracing for classic implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define RCU_TREE_NONCORE -#include "rcutree.h" - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c", - rdp->qlen, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c/%d ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -#define PRINT_RCU_DATA(name, func, m) \ - do { \ - int _p_r_d_i; \ - \ - for_each_possible_cpu(_p_r_d_i) \ - func(m, &per_cpu(name, _p_r_d_i)); \ - } while (0) - -static int show_rcudata(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); - seq_puts(m, "rcu_bh:\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); - return 0; -} - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcudata, NULL); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = seq_lseek, - 
.release = single_release, -}; - -static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", - rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->passed_quiesce_gpnum, - rdp->qs_pending); - seq_printf(m, ",%d,%llx,%d,%lu", - atomic_read(&rdp->dynticks->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, ",%d,\"%c\"", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu))); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, ",%ld", rdp->blimit); - seq_printf(m, ",%lu,%lu,%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata_csv(struct seq_file *m, void *unused) -{ - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); -#ifdef CONFIG_RCU_BOOST - seq_puts(m, "\"kt\",\"ktl\""); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "\"rcu_preempt:\"\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "\"rcu_sched:\"\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); - seq_puts(m, "\"rcu_bh:\"\n"); - PRINT_RCU_DATA(rcu_bh_data, 
print_one_rcu_data_csv, m); - return 0; -} - -static int rcudata_csv_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcudata_csv, NULL); -} - -static const struct file_operations rcudata_csv_fops = { - .owner = THIS_MODULE, - .open = rcudata_csv_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " - "j=%04x bt=%04x\n", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts, - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - " balk", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - rnp->n_balk_nos); -} - -static int show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * Create the rcuboost debugfs entry. Standard error return. 
- */ -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, - &rcu_node_boost_fops); -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return 0; /* There cannot be an error if we didn't create it! */ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, gpnum, rsp->fqs_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff), - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_one_rcu_state(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_one_rcu_state(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_one_rcu_state(m, &rcu_bh_state); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, NULL); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct 
rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave(&rnp->lock, flags); - completed = rsp->completed; - gpnum = rsp->gpnum; - if (rsp->completed == rsp->gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", - rsp->name, completed, gpnum, gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - show_one_rcugp(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - show_one_rcugp(m, &rcu_sched_state); - show_one_rcugp(m, &rcu_bh_state); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, NULL); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld rpq=%ld cbr=%ld cng=%ld " - "gpc=%ld gps=%ld nf=%ld nn=%ld\n", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - rdp->n_rcu_pending, - rdp->n_rp_qs_pending, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp, - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_need_fqs, - rdp->n_rp_need_nothing); -} - -static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) -{ - int cpu; - struct rcu_data *rdp; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); - } -} - -static int show_rcu_pending(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_rcu_pendings(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_rcu_pendings(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_rcu_pendings(m, &rcu_bh_state); - return 0; -} - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_pending, NULL); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? 
"(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcudata.csv", 0444, rcudir, - NULL, &rcudata_csv_fops); - if (!retval) - goto free_out; - - if (rcu_boost_trace_create_file(rcudir)) - goto free_out; - - retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, rcudir, - NULL, &rcuhier_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, rcudir, - NULL, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); -/* - * Public API and common code for kernel->userspace relay file support. - * - * See Documentation/filesystems/relay.txt for an overview. 
- * - * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp - * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) - * - * Moved to kernel/relay.c by Paul Mundt, 2006. - * November 2006 - CPU hotplug support by Mathieu Desnoyers - * (mathieu.desnoyers@polymtl.ca) - * - * This file is released under the GPL. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* list of open channels, for cpu hotplug */ -static DEFINE_MUTEX(relay_channels_mutex); -static LIST_HEAD(relay_channels); - -/* - * close() vm_op implementation for relay file mapping. - */ -static void relay_file_mmap_close(struct vm_area_struct *vma) -{ - struct rchan_buf *buf = vma->vm_private_data; - buf->chan->cb->buf_unmapped(buf, vma->vm_file); -} - -/* - * fault() vm_op implementation for relay file mapping. - */ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page; - struct rchan_buf *buf = vma->vm_private_data; - pgoff_t pgoff = vmf->pgoff; - - if (!buf) - return VM_FAULT_OOM; - - page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT)); - if (!page) - return VM_FAULT_SIGBUS; - get_page(page); - vmf->page = page; - - return 0; -} - -/* - * vm_ops for relay file mappings. 
- */ -static const struct vm_operations_struct relay_file_mmap_ops = { - .fault = relay_buf_fault, - .close = relay_file_mmap_close, -}; - -/* - * allocate an array of pointers of struct page - */ -static struct page **relay_alloc_page_array(unsigned int n_pages) -{ - const size_t pa_size = n_pages * sizeof(struct page *); - if (pa_size > PAGE_SIZE) - return vzalloc(pa_size); - return kzalloc(pa_size, GFP_KERNEL); -} - -/* - * free an array of pointers of struct page - */ -static void relay_free_page_array(struct page **array) -{ - if (is_vmalloc_addr(array)) - vfree(array); - else - kfree(array); -} - -/** - * relay_mmap_buf: - mmap channel buffer to process address space - * @buf: relay channel buffer - * @vma: vm_area_struct describing memory to be mapped - * - * Returns 0 if ok, negative on error - * - * Caller should already have grabbed mmap_sem. - */ -static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) -{ - unsigned long length = vma->vm_end - vma->vm_start; - struct file *filp = vma->vm_file; - - if (!buf) - return -EBADF; - - if (length != (unsigned long)buf->chan->alloc_size) - return -EINVAL; - - vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; - vma->vm_private_data = buf; - buf->chan->cb->buf_mapped(buf, filp); - - return 0; -} - -/** - * relay_alloc_buf - allocate a channel buffer - * @buf: the buffer struct - * @size: total size of the buffer - * - * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The - * passed in size will get page aligned, if it isn't already. 
- */ -static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) -{ - void *mem; - unsigned int i, j, n_pages; - - *size = PAGE_ALIGN(*size); - n_pages = *size >> PAGE_SHIFT; - - buf->page_array = relay_alloc_page_array(n_pages); - if (!buf->page_array) - return NULL; - - for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); - if (unlikely(!buf->page_array[i])) - goto depopulate; - set_page_private(buf->page_array[i], (unsigned long)buf); - } - mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); - if (!mem) - goto depopulate; - - memset(mem, 0, *size); - buf->page_count = n_pages; - return mem; - -depopulate: - for (j = 0; j < i; j++) - __free_page(buf->page_array[j]); - relay_free_page_array(buf->page_array); - return NULL; -} - -/** - * relay_create_buf - allocate and initialize a channel buffer - * @chan: the relay channel - * - * Returns channel buffer if successful, %NULL otherwise. - */ -static struct rchan_buf *relay_create_buf(struct rchan *chan) -{ - struct rchan_buf *buf; - - if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) - return NULL; - - buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) - return NULL; - buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); - if (!buf->padding) - goto free_buf; - - buf->start = relay_alloc_buf(buf, &chan->alloc_size); - if (!buf->start) - goto free_buf; - - buf->chan = chan; - kref_get(&buf->chan->kref); - return buf; - -free_buf: - kfree(buf->padding); - kfree(buf); - return NULL; -} - -/** - * relay_destroy_channel - free the channel struct - * @kref: target kernel reference that contains the relay channel - * - * Should only be called from kref_put(). 
- */ -static void relay_destroy_channel(struct kref *kref) -{ - struct rchan *chan = container_of(kref, struct rchan, kref); - kfree(chan); -} - -/** - * relay_destroy_buf - destroy an rchan_buf struct and associated buffer - * @buf: the buffer struct - */ -static void relay_destroy_buf(struct rchan_buf *buf) -{ - struct rchan *chan = buf->chan; - unsigned int i; - - if (likely(buf->start)) { - vunmap(buf->start); - for (i = 0; i < buf->page_count; i++) - __free_page(buf->page_array[i]); - relay_free_page_array(buf->page_array); - } - chan->buf[buf->cpu] = NULL; - kfree(buf->padding); - kfree(buf); - kref_put(&chan->kref, relay_destroy_channel); -} - -/** - * relay_remove_buf - remove a channel buffer - * @kref: target kernel reference that contains the relay buffer - * - * Removes the file from the fileystem, which also frees the - * rchan_buf_struct and the channel buffer. Should only be called from - * kref_put(). - */ -static void relay_remove_buf(struct kref *kref) -{ - struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - buf->chan->cb->remove_buf_file(buf->dentry); - relay_destroy_buf(buf); -} - -/** - * relay_buf_empty - boolean, is the channel buffer empty? - * @buf: channel buffer - * - * Returns 1 if the buffer is empty, 0 otherwise. - */ -static int relay_buf_empty(struct rchan_buf *buf) -{ - return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; -} - -/** - * relay_buf_full - boolean, is the channel buffer full? - * @buf: channel buffer - * - * Returns 1 if the buffer is full, 0 otherwise. - */ -int relay_buf_full(struct rchan_buf *buf) -{ - size_t ready = buf->subbufs_produced - buf->subbufs_consumed; - return (ready >= buf->chan->n_subbufs) ? 1 : 0; -} -EXPORT_SYMBOL_GPL(relay_buf_full); - -/* - * High-level relay kernel API and associated functions. - */ - -/* - * rchan_callback implementations defining default channel behavior. Used - * in place of corresponding NULL values in client callback struct. 
- */ - -/* - * subbuf_start() default callback. Does nothing. - */ -static int subbuf_start_default_callback (struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) -{ - if (relay_buf_full(buf)) - return 0; - - return 1; -} - -/* - * buf_mapped() default callback. Does nothing. - */ -static void buf_mapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - -/* - * buf_unmapped() default callback. Does nothing. - */ -static void buf_unmapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - -/* - * create_buf_file_create() default callback. Does nothing. - */ -static struct dentry *create_buf_file_default_callback(const char *filename, - struct dentry *parent, - umode_t mode, - struct rchan_buf *buf, - int *is_global) -{ - return NULL; -} - -/* - * remove_buf_file() default callback. Does nothing. - */ -static int remove_buf_file_default_callback(struct dentry *dentry) -{ - return -EINVAL; -} - -/* relay channel default callbacks */ -static struct rchan_callbacks default_channel_callbacks = { - .subbuf_start = subbuf_start_default_callback, - .buf_mapped = buf_mapped_default_callback, - .buf_unmapped = buf_unmapped_default_callback, - .create_buf_file = create_buf_file_default_callback, - .remove_buf_file = remove_buf_file_default_callback, -}; - -/** - * wakeup_readers - wake up readers waiting on a channel - * @data: contains the channel buffer - * - * This is the timer function used to defer reader waking. - */ -static void wakeup_readers(unsigned long data) -{ - struct rchan_buf *buf = (struct rchan_buf *)data; - wake_up_interruptible(&buf->read_wait); -} - -/** - * __relay_reset - reset a channel buffer - * @buf: the channel buffer - * @init: 1 if this is a first-time initialization - * - * See relay_reset() for description of effect. 
- */ -static void __relay_reset(struct rchan_buf *buf, unsigned int init) -{ - size_t i; - - if (init) { - init_waitqueue_head(&buf->read_wait); - kref_init(&buf->kref); - setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); - } else - del_timer_sync(&buf->timer); - - buf->subbufs_produced = 0; - buf->subbufs_consumed = 0; - buf->bytes_consumed = 0; - buf->finalized = 0; - buf->data = buf->start; - buf->offset = 0; - - for (i = 0; i < buf->chan->n_subbufs; i++) - buf->padding[i] = 0; - - buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); -} - -/** - * relay_reset - reset the channel - * @chan: the channel - * - * This has the effect of erasing all data from all channel buffers - * and restarting the channel in its initial state. The buffers - * are not freed, so any mappings are still in effect. - * - * NOTE. Care should be taken that the channel isn't actually - * being used by anything when this call is made. - */ -void relay_reset(struct rchan *chan) -{ - unsigned int i; - - if (!chan) - return; - - if (chan->is_global && chan->buf[0]) { - __relay_reset(chan->buf[0], 0); - return; - } - - mutex_lock(&relay_channels_mutex); - for_each_possible_cpu(i) - if (chan->buf[i]) - __relay_reset(chan->buf[i], 0); - mutex_unlock(&relay_channels_mutex); -} -EXPORT_SYMBOL_GPL(relay_reset); - -static inline void relay_set_buf_dentry(struct rchan_buf *buf, - struct dentry *dentry) -{ - buf->dentry = dentry; - buf->dentry->d_inode->i_size = buf->early_bytes; -} - -static struct dentry *relay_create_buf_file(struct rchan *chan, - struct rchan_buf *buf, - unsigned int cpu) -{ - struct dentry *dentry; - char *tmpname; - - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - return NULL; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); - - /* Create file in fs */ - dentry = chan->cb->create_buf_file(tmpname, chan->parent, - S_IRUSR, buf, - &chan->is_global); - - kfree(tmpname); - - return dentry; -} - -/* - * relay_open_buf - create a new relay 
channel buffer - * - * used by relay_open() and CPU hotplug. - */ -static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) -{ - struct rchan_buf *buf = NULL; - struct dentry *dentry; - - if (chan->is_global) - return chan->buf[0]; - - buf = relay_create_buf(chan); - if (!buf) - return NULL; - - if (chan->has_base_filename) { - dentry = relay_create_buf_file(chan, buf, cpu); - if (!dentry) - goto free_buf; - relay_set_buf_dentry(buf, dentry); - } - - buf->cpu = cpu; - __relay_reset(buf, 1); - - if(chan->is_global) { - chan->buf[0] = buf; - buf->cpu = 0; - } - - return buf; - -free_buf: - relay_destroy_buf(buf); - return NULL; -} - -/** - * relay_close_buf - close a channel buffer - * @buf: channel buffer - * - * Marks the buffer finalized and restores the default callbacks. - * The channel buffer and channel buffer data structure are then freed - * automatically when the last reference is given up. - */ -static void relay_close_buf(struct rchan_buf *buf) -{ - buf->finalized = 1; - del_timer_sync(&buf->timer); - kref_put(&buf->kref, relay_remove_buf); -} - -static void setup_callbacks(struct rchan *chan, - struct rchan_callbacks *cb) -{ - if (!cb) { - chan->cb = &default_channel_callbacks; - return; - } - - if (!cb->subbuf_start) - cb->subbuf_start = subbuf_start_default_callback; - if (!cb->buf_mapped) - cb->buf_mapped = buf_mapped_default_callback; - if (!cb->buf_unmapped) - cb->buf_unmapped = buf_unmapped_default_callback; - if (!cb->create_buf_file) - cb->create_buf_file = create_buf_file_default_callback; - if (!cb->remove_buf_file) - cb->remove_buf_file = remove_buf_file_default_callback; - chan->cb = cb; -} - -/** - * relay_hotcpu_callback - CPU hotplug callback - * @nb: notifier block - * @action: hotplug action to take - * @hcpu: CPU number - * - * Returns the success/failure of the operation. 
(%NOTIFY_OK, %NOTIFY_BAD) - */ -static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) -{ - unsigned int hotcpu = (unsigned long)hcpu; - struct rchan *chan; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&relay_channels_mutex); - list_for_each_entry(chan, &relay_channels, list) { - if (chan->buf[hotcpu]) - continue; - chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); - if(!chan->buf[hotcpu]) { - printk(KERN_ERR - "relay_hotcpu_callback: cpu %d buffer " - "creation failed\n", hotcpu); - mutex_unlock(&relay_channels_mutex); - return notifier_from_errno(-ENOMEM); - } - } - mutex_unlock(&relay_channels_mutex); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* No need to flush the cpu : will be flushed upon - * final relay_flush() call. */ - break; - } - return NOTIFY_OK; -} - -/** - * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only - * @parent: dentry of parent directory, %NULL for root directory or buffer - * @subbuf_size: size of sub-buffers - * @n_subbufs: number of sub-buffers - * @cb: client callback functions - * @private_data: user-defined data - * - * Returns channel pointer if successful, %NULL otherwise. - * - * Creates a channel buffer for each cpu using the sizes and - * attributes specified. The created channel buffer files - * will be named base_filename0...base_filenameN-1. File - * permissions will be %S_IRUSR. 
- */ -struct rchan *relay_open(const char *base_filename, - struct dentry *parent, - size_t subbuf_size, - size_t n_subbufs, - struct rchan_callbacks *cb, - void *private_data) -{ - unsigned int i; - struct rchan *chan; - - if (!(subbuf_size && n_subbufs)) - return NULL; - if (subbuf_size > UINT_MAX / n_subbufs) - return NULL; - - chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); - if (!chan) - return NULL; - - chan->version = RELAYFS_CHANNEL_VERSION; - chan->n_subbufs = n_subbufs; - chan->subbuf_size = subbuf_size; - chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); - chan->parent = parent; - chan->private_data = private_data; - if (base_filename) { - chan->has_base_filename = 1; - strlcpy(chan->base_filename, base_filename, NAME_MAX); - } - setup_callbacks(chan, cb); - kref_init(&chan->kref); - - mutex_lock(&relay_channels_mutex); - for_each_online_cpu(i) { - chan->buf[i] = relay_open_buf(chan, i); - if (!chan->buf[i]) - goto free_bufs; - } - list_add(&chan->list, &relay_channels); - mutex_unlock(&relay_channels_mutex); - - return chan; - -free_bufs: - for_each_possible_cpu(i) { - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); - } - - kref_put(&chan->kref, relay_destroy_channel); - mutex_unlock(&relay_channels_mutex); - return NULL; -} -EXPORT_SYMBOL_GPL(relay_open); - -struct rchan_percpu_buf_dispatcher { - struct rchan_buf *buf; - struct dentry *dentry; -}; - -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel. - * Useful to do early tracing in kernel, before VFS is up, for example. 
- */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strlcpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. - */ - for_each_online_cpu(i) { - if (unlikely(!chan->buf[i])) { - WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, chan->buf[i], i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(chan->buf[i], dentry); - local_irq_restore(flags); - } else { - disp.buf = chan->buf[i]; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. */ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} - -/** - * relay_switch_subbuf - switch to a new sub-buffer - * @buf: channel buffer - * @length: size of current event - * - * Returns either the length passed in or 0 if full. - * - * Performs sub-buffer-switch tasks such as invoking callbacks, - * updating padding counts, waking up readers, etc. 
- */ -size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) -{ - void *old, *new; - size_t old_subbuf, new_subbuf; - - if (unlikely(length > buf->chan->subbuf_size)) - goto toobig; - - if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; - old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; - buf->subbufs_produced++; - if (buf->dentry) - buf->dentry->d_inode->i_size += - buf->chan->subbuf_size - - buf->padding[old_subbuf]; - else - buf->early_bytes += buf->chan->subbuf_size - - buf->padding[old_subbuf]; - smp_mb(); - if (waitqueue_active(&buf->read_wait)) - /* - * Calling wake_up_interruptible() from here - * will deadlock if we happen to be logging - * from the scheduler (trying to re-grab - * rq->lock), so defer it. - */ - mod_timer(&buf->timer, jiffies + 1); - } - - old = buf->data; - new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - new = buf->start + new_subbuf * buf->chan->subbuf_size; - buf->offset = 0; - if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { - buf->offset = buf->chan->subbuf_size + 1; - return 0; - } - buf->data = new; - buf->padding[new_subbuf] = 0; - - if (unlikely(length + buf->offset > buf->chan->subbuf_size)) - goto toobig; - - return length; - -toobig: - buf->chan->last_toobig = length; - return 0; -} -EXPORT_SYMBOL_GPL(relay_switch_subbuf); - -/** - * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count - * @chan: the channel - * @cpu: the cpu associated with the channel buffer to update - * @subbufs_consumed: number of sub-buffers to add to current buf's count - * - * Adds to the channel buffer's consumed sub-buffer count. - * subbufs_consumed should be the number of sub-buffers newly consumed, - * not the total consumed. - * - * NOTE. Kernel clients don't need to call this function if the channel - * mode is 'overwrite'. 
- */ -void relay_subbufs_consumed(struct rchan *chan, - unsigned int cpu, - size_t subbufs_consumed) -{ - struct rchan_buf *buf; - - if (!chan) - return; - - if (cpu >= NR_CPUS || !chan->buf[cpu] || - subbufs_consumed > chan->n_subbufs) - return; - - buf = chan->buf[cpu]; - if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) - buf->subbufs_consumed = buf->subbufs_produced; - else - buf->subbufs_consumed += subbufs_consumed; -} -EXPORT_SYMBOL_GPL(relay_subbufs_consumed); - -/** - * relay_close - close the channel - * @chan: the channel - * - * Closes all channel buffers and frees the channel. - */ -void relay_close(struct rchan *chan) -{ - unsigned int i; - - if (!chan) - return; - - mutex_lock(&relay_channels_mutex); - if (chan->is_global && chan->buf[0]) - relay_close_buf(chan->buf[0]); - else - for_each_possible_cpu(i) - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); - - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%Zd) > sub-buffer size (%Zd)]\n", - chan->last_toobig, chan->subbuf_size); - - list_del(&chan->list); - kref_put(&chan->kref, relay_destroy_channel); - mutex_unlock(&relay_channels_mutex); -} -EXPORT_SYMBOL_GPL(relay_close); - -/** - * relay_flush - close the channel - * @chan: the channel - * - * Flushes all channel buffers, i.e. forces buffer switch. - */ -void relay_flush(struct rchan *chan) -{ - unsigned int i; - - if (!chan) - return; - - if (chan->is_global && chan->buf[0]) { - relay_switch_subbuf(chan->buf[0], 0); - return; - } - - mutex_lock(&relay_channels_mutex); - for_each_possible_cpu(i) - if (chan->buf[i]) - relay_switch_subbuf(chan->buf[i], 0); - mutex_unlock(&relay_channels_mutex); -} -EXPORT_SYMBOL_GPL(relay_flush); - -/** - * relay_file_open - open file op for relay files - * @inode: the inode - * @filp: the file - * - * Increments the channel buffer refcount. 
- */ -static int relay_file_open(struct inode *inode, struct file *filp) -{ - struct rchan_buf *buf = inode->i_private; - kref_get(&buf->kref); - filp->private_data = buf; - - return nonseekable_open(inode, filp); -} - -/** - * relay_file_mmap - mmap file op for relay files - * @filp: the file - * @vma: the vma describing what to map - * - * Calls upon relay_mmap_buf() to map the file into user space. - */ -static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct rchan_buf *buf = filp->private_data; - return relay_mmap_buf(buf, vma); -} - -/** - * relay_file_poll - poll file op for relay files - * @filp: the file - * @wait: poll table - * - * Poll implemention. - */ -static unsigned int relay_file_poll(struct file *filp, poll_table *wait) -{ - unsigned int mask = 0; - struct rchan_buf *buf = filp->private_data; - - if (buf->finalized) - return POLLERR; - - if (filp->f_mode & FMODE_READ) { - poll_wait(filp, &buf->read_wait, wait); - if (!relay_buf_empty(buf)) - mask |= POLLIN | POLLRDNORM; - } - - return mask; -} - -/** - * relay_file_release - release file op for relay files - * @inode: the inode - * @filp: the file - * - * Decrements the channel refcount, as the filesystem is - * no longer using it. 
- */ -static int relay_file_release(struct inode *inode, struct file *filp) -{ - struct rchan_buf *buf = filp->private_data; - kref_put(&buf->kref, relay_remove_buf); - - return 0; -} - -/* - * relay_file_read_consume - update the consumed count for the buffer - */ -static void relay_file_read_consume(struct rchan_buf *buf, - size_t read_pos, - size_t bytes_consumed) -{ - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - size_t read_subbuf; - - if (buf->subbufs_produced == buf->subbufs_consumed && - buf->offset == buf->bytes_consumed) - return; - - if (buf->bytes_consumed + bytes_consumed > subbuf_size) { - relay_subbufs_consumed(buf->chan, buf->cpu, 1); - buf->bytes_consumed = 0; - } - - buf->bytes_consumed += bytes_consumed; - if (!read_pos) - read_subbuf = buf->subbufs_consumed % n_subbufs; - else - read_subbuf = read_pos / buf->chan->subbuf_size; - if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { - if ((read_subbuf == buf->subbufs_produced % n_subbufs) && - (buf->offset == subbuf_size)) - return; - relay_subbufs_consumed(buf->chan, buf->cpu, 1); - buf->bytes_consumed = 0; - } -} - -/* - * relay_file_read_avail - boolean, are there unconsumed bytes available? 
- */ -static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) -{ - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; - - relay_file_read_consume(buf, read_pos, 0); - - consumed = buf->subbufs_consumed; - - if (unlikely(buf->offset > subbuf_size)) { - if (produced == consumed) - return 0; - return 1; - } - - if (unlikely(produced - consumed >= n_subbufs)) { - consumed = produced - n_subbufs + 1; - buf->subbufs_consumed = consumed; - buf->bytes_consumed = 0; - } - - produced = (produced % n_subbufs) * subbuf_size + buf->offset; - consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; - - if (consumed > produced) - produced += n_subbufs * subbuf_size; - - if (consumed == produced) { - if (buf->offset == subbuf_size && - buf->subbufs_produced > buf->subbufs_consumed) - return 1; - return 0; - } - - return 1; -} - -/** - * relay_file_read_subbuf_avail - return bytes available in sub-buffer - * @read_pos: file read position - * @buf: relay channel buffer - */ -static size_t relay_file_read_subbuf_avail(size_t read_pos, - struct rchan_buf *buf) -{ - size_t padding, avail = 0; - size_t read_subbuf, read_offset, write_subbuf, write_offset; - size_t subbuf_size = buf->chan->subbuf_size; - - write_subbuf = (buf->data - buf->start) / subbuf_size; - write_offset = buf->offset > subbuf_size ? 
subbuf_size : buf->offset; - read_subbuf = read_pos / subbuf_size; - read_offset = read_pos % subbuf_size; - padding = buf->padding[read_subbuf]; - - if (read_subbuf == write_subbuf) { - if (read_offset + padding < write_offset) - avail = write_offset - (read_offset + padding); - } else - avail = (subbuf_size - padding) - read_offset; - - return avail; -} - -/** - * relay_file_read_start_pos - find the first available byte to read - * @read_pos: file read position - * @buf: relay channel buffer - * - * If the @read_pos is in the middle of padding, return the - * position of the first actually available byte, otherwise - * return the original value. - */ -static size_t relay_file_read_start_pos(size_t read_pos, - struct rchan_buf *buf) -{ - size_t read_subbuf, padding, padding_start, padding_end; - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - size_t consumed = buf->subbufs_consumed % n_subbufs; - - if (!read_pos) - read_pos = consumed * subbuf_size + buf->bytes_consumed; - read_subbuf = read_pos / subbuf_size; - padding = buf->padding[read_subbuf]; - padding_start = (read_subbuf + 1) * subbuf_size - padding; - padding_end = (read_subbuf + 1) * subbuf_size; - if (read_pos >= padding_start && read_pos < padding_end) { - read_subbuf = (read_subbuf + 1) % n_subbufs; - read_pos = read_subbuf * subbuf_size; - } - - return read_pos; -} - -/** - * relay_file_read_end_pos - return the new read position - * @read_pos: file read position - * @buf: relay channel buffer - * @count: number of bytes to be read - */ -static size_t relay_file_read_end_pos(struct rchan_buf *buf, - size_t read_pos, - size_t count) -{ - size_t read_subbuf, padding, end_pos; - size_t subbuf_size = buf->chan->subbuf_size; - size_t n_subbufs = buf->chan->n_subbufs; - - read_subbuf = read_pos / subbuf_size; - padding = buf->padding[read_subbuf]; - if (read_pos % subbuf_size + count + padding == subbuf_size) - end_pos = (read_subbuf + 1) * subbuf_size; - else - 
end_pos = read_pos + count; - if (end_pos >= subbuf_size * n_subbufs) - end_pos = 0; - - return end_pos; -} - -/* - * subbuf_read_actor - read up to one subbuf's worth of data - */ -static int subbuf_read_actor(size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc, - read_actor_t actor) -{ - void *from; - int ret = 0; - - from = buf->start + read_start; - ret = avail; - if (copy_to_user(desc->arg.buf, from, avail)) { - desc->error = -EFAULT; - ret = 0; - } - desc->arg.data += ret; - desc->written += ret; - desc->count -= ret; - - return ret; -} - -typedef int (*subbuf_actor_t) (size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc, - read_actor_t actor); - -/* - * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries - */ -static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_actor_t actor, - read_descriptor_t *desc) -{ - struct rchan_buf *buf = filp->private_data; - size_t read_start, avail; - int ret; - - if (!desc->count) - return 0; - - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); - do { - if (!relay_file_read_avail(buf, *ppos)) - break; - - read_start = relay_file_read_start_pos(*ppos, buf); - avail = relay_file_read_subbuf_avail(read_start, buf); - if (!avail) - break; - - avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc, actor); - if (desc->error < 0) - break; - - if (ret) { - relay_file_read_consume(buf, read_start, ret); - *ppos = relay_file_read_end_pos(buf, read_start, ret); - } - } while (desc->count && ret); - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); - - return desc->written; -} - -static ssize_t relay_file_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) -{ - read_descriptor_t desc; - desc.written = 0; - desc.count = count; - desc.arg.buf = buffer; - desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, - 
NULL, &desc); -} - -static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) -{ - rbuf->bytes_consumed += bytes_consumed; - - if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { - relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); - rbuf->bytes_consumed %= rbuf->chan->subbuf_size; - } -} - -static void relay_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct rchan_buf *rbuf; - - rbuf = (struct rchan_buf *)page_private(buf->page); - relay_consume_bytes(rbuf, buf->private); -} - -static const struct pipe_buf_operations relay_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - -static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -{ -} - -/* - * subbuf_splice_actor - splice up to one subbuf's worth of data - */ -static ssize_t subbuf_splice_actor(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags, - int *nonpad_ret) -{ - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; - struct rchan_buf *rbuf = in->private_data; - unsigned int subbuf_size = rbuf->chan->subbuf_size; - uint64_t pos = (uint64_t) *ppos; - uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; - size_t read_start = (size_t) do_div(pos, alloc_size); - size_t read_subbuf = read_start / subbuf_size; - size_t padding = rbuf->padding[read_subbuf]; - size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .nr_pages = 0, - .partial = partial, - .flags = flags, - .ops = &relay_pipe_buf_ops, - .spd_release = relay_page_release, - }; - ssize_t ret; - - if (rbuf->subbufs_produced == rbuf->subbufs_consumed) - return 0; - if 
(splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - /* - * Adjust read len, if longer than what is available - */ - if (len > (subbuf_size - read_start % subbuf_size)) - len = subbuf_size - read_start % subbuf_size; - - subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; - pidx = (read_start / PAGE_SIZE) % subbuf_pages; - poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); - - for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { - unsigned int this_len, this_end, private; - unsigned int cur_pos = read_start + total_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - poff); - private = this_len; - - spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; - spd.partial[spd.nr_pages].offset = poff; - - this_end = cur_pos + this_len; - if (this_end >= nonpad_end) { - this_len = nonpad_end - cur_pos; - private = this_len + padding; - } - spd.partial[spd.nr_pages].len = this_len; - spd.partial[spd.nr_pages].private = private; - - len -= this_len; - total_len += this_len; - poff = 0; - pidx = (pidx + 1) % subbuf_pages; - - if (this_end >= nonpad_end) { - spd.nr_pages++; - break; - } - } - - ret = 0; - if (!spd.nr_pages) - goto out; - - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); - if (ret < 0 || ret < total_len) - goto out; - - if (read_start + ret == nonpad_end) - ret += padding; - -out: - splice_shrink_spd(pipe, &spd); - return ret; -} - -static ssize_t relay_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - ssize_t spliced; - int ret; - int nonpad_ret = 0; - - ret = 0; - spliced = 0; - - while (len && !spliced) { - ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); - if (ret < 0) - break; - else if (!ret) { - if (flags & SPLICE_F_NONBLOCK) - ret = -EAGAIN; - break; - } - - *ppos += ret; - if (ret > len) - len = 0; - else - len -= ret; - spliced += nonpad_ret; - nonpad_ret = 0; - } - - if (spliced) - 
return spliced; - - return ret; -} - -const struct file_operations relay_file_operations = { - .open = relay_file_open, - .poll = relay_file_poll, - .mmap = relay_file_mmap, - .read = relay_file_read, - .llseek = no_llseek, - .release = relay_file_release, - .splice_read = relay_file_splice_read, -}; -EXPORT_SYMBOL_GPL(relay_file_operations); - -static __init int relay_init(void) -{ - - hotcpu_notifier(relay_hotcpu_callback, 0); - return 0; -} - -early_initcall(relay_init); -/* - * resource cgroups - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov - * - */ - -#include -#include -#include -#include -#include -#include - -void res_counter_init(struct res_counter *counter, struct res_counter *parent) -{ - spin_lock_init(&counter->lock); - counter->limit = RESOURCE_MAX; - counter->soft_limit = RESOURCE_MAX; - counter->parent = parent; -} - -int res_counter_charge_locked(struct res_counter *counter, unsigned long val) -{ - if (counter->usage + val > counter->limit) { - counter->failcnt++; - return -ENOMEM; - } - - counter->usage += val; - if (counter->usage > counter->max_usage) - counter->max_usage = counter->usage; - return 0; -} - -int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - int ret; - unsigned long flags; - struct res_counter *c, *u; - - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - ret = res_counter_charge_locked(c, val); - spin_unlock(&c->lock); - if (ret < 0) { - *limit_fail_at = c; - goto undo; - } - } - ret = 0; - goto done; -undo: - for (u = counter; u != c; u = u->parent) { - spin_lock(&u->lock); - res_counter_uncharge_locked(u, val); - spin_unlock(&u->lock); - } -done: - local_irq_restore(flags); - return ret; -} - -int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - int ret, r; - unsigned long flags; - struct res_counter *c; 
- - r = ret = 0; - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - r = res_counter_charge_locked(c, val); - if (r) - c->usage += val; - spin_unlock(&c->lock); - if (r < 0 && ret == 0) { - *limit_fail_at = c; - ret = r; - } - } - local_irq_restore(flags); - - return ret; -} -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; -} - -void res_counter_uncharge(struct res_counter *counter, unsigned long val) -{ - unsigned long flags; - struct res_counter *c; - - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); - spin_unlock(&c->lock); - } - local_irq_restore(flags); -} - - -static inline unsigned long long * -res_counter_member(struct res_counter *counter, int member) -{ - switch (member) { - case RES_USAGE: - return &counter->usage; - case RES_MAX_USAGE: - return &counter->max_usage; - case RES_LIMIT: - return &counter->limit; - case RES_FAILCNT: - return &counter->failcnt; - case RES_SOFT_LIMIT: - return &counter->soft_limit; - }; - - BUG(); - return NULL; -} - -ssize_t res_counter_read(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *st_buf)) -{ - unsigned long long *val; - char buf[64], *s; - - s = buf; - val = res_counter_member(counter, member); - if (read_strategy) - s += read_strategy(*val, s); - else - s += sprintf(s, "%llu\n", *val); - return simple_read_from_buffer((void __user *)userbuf, nbytes, - pos, buf, s - buf); -} - -#if BITS_PER_LONG == 32 -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&counter->lock, flags); - ret = *res_counter_member(counter, member); - spin_unlock_irqrestore(&counter->lock, 
flags); - - return ret; -} -#else -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - return *res_counter_member(counter, member); -} -#endif - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *res) -{ - char *end; - - /* return RESOURCE_MAX(unlimited) if "-1" is specified */ - if (*buf == '-') { - *res = simple_strtoull(buf + 1, &end, 10); - if (*res != 1 || *end != '\0') - return -EINVAL; - *res = RESOURCE_MAX; - return 0; - } - - *res = memparse(buf, &end); - if (*end != '\0') - return -EINVAL; - - *res = PAGE_ALIGN(*res); - return 0; -} - -int res_counter_write(struct res_counter *counter, int member, - const char *buf, write_strategy_fn write_strategy) -{ - char *end; - unsigned long flags; - unsigned long long tmp, *val; - - if (write_strategy) { - if (write_strategy(buf, &tmp)) - return -EINVAL; - } else { - tmp = simple_strtoull(buf, &end, 10); - if (*end != '\0') - return -EINVAL; - } - spin_lock_irqsave(&counter->lock, flags); - val = res_counter_member(counter, member); - *val = tmp; - spin_unlock_irqrestore(&counter->lock, flags); - return 0; -} -/* - * linux/kernel/resource.c - * - * Copyright (C) 1999 Linus Torvalds - * Copyright (C) 1999 Martin Mares - * - * Arbitrary resource management. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -struct resource ioport_resource = { - .name = "PCI IO", - .start = 0, - .end = IO_SPACE_LIMIT, - .flags = IORESOURCE_IO, -}; -EXPORT_SYMBOL(ioport_resource); - -struct resource iomem_resource = { - .name = "PCI mem", - .start = 0, - .end = -1, - .flags = IORESOURCE_MEM, -}; -EXPORT_SYMBOL(iomem_resource); - -/* constraints to be met while allocating resources */ -struct resource_constraint { - resource_size_t min, max, align; - resource_size_t (*alignf)(void *, const struct resource *, - resource_size_t, resource_size_t); - void *alignf_data; -}; - -static DEFINE_RWLOCK(resource_lock); - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct resource *p = v; - (*pos)++; - if (p->child) - return p->child; - while (!p->sibling && p->parent) - p = p->parent; - return p->sibling; -} - -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - -static void *r_start(struct seq_file *m, loff_t *pos) - __acquires(resource_lock) -{ - struct resource *p = m->private; - loff_t l = 0; - read_lock(&resource_lock); - for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) - ; - return p; -} - -static void r_stop(struct seq_file *m, void *v) - __releases(resource_lock) -{ - read_unlock(&resource_lock); -} - -static int r_show(struct seq_file *m, void *v) -{ - struct resource *root = m->private; - struct resource *r = v, *p; - int width = root->end < 0x10000 ? 4 : 8; - int depth; - - for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) - if (p->parent == root) - break; - seq_printf(m, "%*s%0*llx-%0*llx : %s\n", - depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, - r->name ? 
r->name : ""); - return 0; -} - -static const struct seq_operations resource_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = r_show, -}; - -static int ioports_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &ioport_resource; - } - return res; -} - -static int iomem_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &iomem_resource; - } - return res; -} - -static const struct file_operations proc_ioports_operations = { - .open = ioports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations proc_iomem_operations = { - .open = iomem_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init ioresources_init(void) -{ - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); - return 0; -} -__initcall(ioresources_init); - -#endif /* CONFIG_PROC_FS */ - -/* Return the conflict entry if you can't request it */ -static struct resource * __request_resource(struct resource *root, struct resource *new) -{ - resource_size_t start = new->start; - resource_size_t end = new->end; - struct resource *tmp, **p; - - if (end < start) - return root; - if (start < root->start) - return root; - if (end > root->end) - return root; - p = &root->child; - for (;;) { - tmp = *p; - if (!tmp || tmp->start > end) { - new->sibling = tmp; - *p = new; - new->parent = root; - return NULL; - } - p = &tmp->sibling; - if (tmp->end < start) - continue; - return tmp; - } -} - -static int __release_resource(struct resource *old) -{ - struct resource *tmp, **p; - - p = &old->parent->child; - for (;;) { - tmp = *p; - if (!tmp) - break; - if (tmp == old) { - *p = tmp->sibling; - old->parent = NULL; - 
return 0; - } - p = &tmp->sibling; - } - return -EINVAL; -} - -static void __release_child_resources(struct resource *r) -{ - struct resource *tmp, *p; - resource_size_t size; - - p = r->child; - r->child = NULL; - while (p) { - tmp = p; - p = p->sibling; - - tmp->parent = NULL; - tmp->sibling = NULL; - __release_child_resources(tmp); - - printk(KERN_DEBUG "release child resource %pR\n", tmp); - /* need to restore size, and keep flags */ - size = resource_size(tmp); - tmp->start = 0; - tmp->end = size - 1; - } -} - -void release_child_resources(struct resource *r) -{ - write_lock(&resource_lock); - __release_child_resources(r); - write_unlock(&resource_lock); -} - -/** - * request_resource_conflict - request and reserve an I/O or memory resource - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * - * Returns 0 for success, conflict resource on error. - */ -struct resource *request_resource_conflict(struct resource *root, struct resource *new) -{ - struct resource *conflict; - - write_lock(&resource_lock); - conflict = __request_resource(root, new); - write_unlock(&resource_lock); - return conflict; -} - -/** - * request_resource - request and reserve an I/O or memory resource - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * - * Returns 0 for success, negative error code on error. - */ -int request_resource(struct resource *root, struct resource *new) -{ - struct resource *conflict; - - conflict = request_resource_conflict(root, new); - return conflict ? 
-EBUSY : 0; -} - -EXPORT_SYMBOL(request_resource); - -/** - * release_resource - release a previously reserved resource - * @old: resource pointer - */ -int release_resource(struct resource *old) -{ - int retval; - - write_lock(&resource_lock); - retval = __release_resource(old); - write_unlock(&resource_lock); - return retval; -} - -EXPORT_SYMBOL(release_resource); - -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) -/* - * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags and "name". - * If found, returns 0, res is overwritten, if not found, returns -1. - */ -static int find_next_system_ram(struct resource *res, char *name) -{ - resource_size_t start, end; - struct resource *p; - - BUG_ON(!res); - - start = res->start; - end = res->end; - BUG_ON(start >= end); - - read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ - if (p->flags != res->flags) - continue; - if (name && strcmp(p->name, name)) - continue; - if (p->start > end) { - p = NULL; - break; - } - if ((p->end >= start) && (p->start < end)) - break; - } - read_unlock(&resource_lock); - if (!p) - return -1; - /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; - return 0; -} - -/* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". 
- */ -int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) -{ - struct resource res; - unsigned long pfn, end_pfn; - u64 orig_end; - int ret = -1; - - res.start = (u64) start_pfn << PAGE_SHIFT; - res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; - end_pfn = (res.end + 1) >> PAGE_SHIFT; - if (end_pfn > pfn) - ret = (*func)(pfn, end_pfn - pfn, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; -} - -#endif - -static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) -{ - return 1; -} -/* - * This generic page_is_ram() returns true if specified address is - * registered as "System RAM" in iomem_resource list. - */ -int __weak page_is_ram(unsigned long pfn) -{ - return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; -} - -void __weak arch_remove_reservations(struct resource *avail) -{ -} - -static resource_size_t simple_align_resource(void *data, - const struct resource *avail, - resource_size_t size, - resource_size_t align) -{ - return avail->start; -} - -static void resource_clip(struct resource *res, resource_size_t min, - resource_size_t max) -{ - if (res->start < min) - res->start = min; - if (res->end > max) - res->end = max; -} - -static bool resource_contains(struct resource *res1, struct resource *res2) -{ - return res1->start <= res2->start && res1->end >= res2->end; -} - -/* - * Find empty slot in the resource tree with the given range and - * alignment constraints - */ -static int __find_resource(struct resource *root, struct resource *old, - struct resource *new, - resource_size_t size, - struct resource_constraint *constraint) -{ - struct resource *this = root->child; - struct resource tmp = 
*new, avail, alloc; - - tmp.flags = new->flags; - tmp.start = root->start; - /* - * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to tmp->end below would cause an underflow. - */ - if (this && this->start == root->start) { - tmp.start = (this == old) ? old->start : this->end + 1; - this = this->sibling; - } - for(;;) { - if (this) - tmp.end = (this == old) ? this->end : this->start - 1; - else - tmp.end = root->end; - - if (tmp.end < tmp.start) - goto next; - - resource_clip(&tmp, constraint->min, constraint->max); - arch_remove_reservations(&tmp); - - /* Check for overflow after ALIGN() */ - avail = *new; - avail.start = ALIGN(tmp.start, constraint->align); - avail.end = tmp.end; - if (avail.start >= tmp.start) { - alloc.start = constraint->alignf(constraint->alignf_data, &avail, - size, constraint->align); - alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { - new->start = alloc.start; - new->end = alloc.end; - return 0; - } - } - -next: if (!this || this->end == root->end) - break; - - if (this != old) - tmp.start = this->end + 1; - this = this->sibling; - } - return -EBUSY; -} - -/* - * Find empty slot in the resource tree given range and alignment. - */ -static int find_resource(struct resource *root, struct resource *new, - resource_size_t size, - struct resource_constraint *constraint) -{ - return __find_resource(root, NULL, new, size, constraint); -} - -/** - * reallocate_resource - allocate a slot in the resource tree given range & alignment. - * The resource will be relocated if the new size cannot be reallocated in the - * current location. - * - * @root: root resource descriptor - * @old: resource descriptor desired by caller - * @newsize: new size of the resource descriptor - * @constraint: the size and alignment constraints to be met. 
- */ -int reallocate_resource(struct resource *root, struct resource *old, - resource_size_t newsize, - struct resource_constraint *constraint) -{ - int err=0; - struct resource new = *old; - struct resource *conflict; - - write_lock(&resource_lock); - - if ((err = __find_resource(root, old, &new, newsize, constraint))) - goto out; - - if (resource_contains(&new, old)) { - old->start = new.start; - old->end = new.end; - goto out; - } - - if (old->child) { - err = -EBUSY; - goto out; - } - - if (resource_contains(old, &new)) { - old->start = new.start; - old->end = new.end; - } else { - __release_resource(old); - *old = new; - conflict = __request_resource(root, old); - BUG_ON(conflict); - } -out: - write_unlock(&resource_lock); - return err; -} - - -/** - * allocate_resource - allocate empty slot in the resource tree given range & alignment. - * The resource will be reallocated with a new size if it was already allocated - * @root: root resource descriptor - * @new: resource descriptor desired by caller - * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate - * @align: alignment requested, in bytes - * @alignf: alignment function, optional, called if not NULL - * @alignf_data: arbitrary data to pass to the @alignf function - */ -int allocate_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), - void *alignf_data) -{ - int err; - struct resource_constraint constraint; - - if (!alignf) - alignf = simple_align_resource; - - constraint.min = min; - constraint.max = max; - constraint.align = align; - constraint.alignf = alignf; - constraint.alignf_data = alignf_data; - - if ( new->parent ) { - /* resource is already allocated, try reallocating with - the new constraints */ - return reallocate_resource(root, new, size, 
&constraint); - } - - write_lock(&resource_lock); - err = find_resource(root, new, size, &constraint); - if (err >= 0 && __request_resource(root, new)) - err = -EBUSY; - write_unlock(&resource_lock); - return err; -} - -EXPORT_SYMBOL(allocate_resource); - -/** - * lookup_resource - find an existing resource by a resource start address - * @root: root resource descriptor - * @start: resource start address - * - * Returns a pointer to the resource if found, NULL otherwise - */ -struct resource *lookup_resource(struct resource *root, resource_size_t start) -{ - struct resource *res; - - read_lock(&resource_lock); - for (res = root->child; res; res = res->sibling) { - if (res->start == start) - break; - } - read_unlock(&resource_lock); - - return res; -} - -/* - * Insert a resource into the resource tree. If successful, return NULL, - * otherwise return the conflicting resource (compare to __request_resource()) - */ -static struct resource * __insert_resource(struct resource *parent, struct resource *new) -{ - struct resource *first, *next; - - for (;; parent = first) { - first = __request_resource(parent, new); - if (!first) - return first; - - if (first == parent) - return first; - if (WARN_ON(first == new)) /* duplicated insertion */ - return first; - - if ((first->start > new->start) || (first->end < new->end)) - break; - if ((first->start == new->start) && (first->end == new->end)) - break; - } - - for (next = first; ; next = next->sibling) { - /* Partial overlap? 
Bad, and unfixable */ - if (next->start < new->start || next->end > new->end) - return next; - if (!next->sibling) - break; - if (next->sibling->start > new->end) - break; - } - - new->parent = parent; - new->sibling = next->sibling; - new->child = first; - - next->sibling = NULL; - for (next = first; next; next = next->sibling) - next->parent = new; - - if (parent->child == first) { - parent->child = new; - } else { - next = parent->child; - while (next->sibling != first) - next = next->sibling; - next->sibling = new; - } - return NULL; -} - -/** - * insert_resource_conflict - Inserts resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, conflict resource if the resource can't be inserted. - * - * This function is equivalent to request_resource_conflict when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. - */ -struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) -{ - struct resource *conflict; - - write_lock(&resource_lock); - conflict = __insert_resource(parent, new); - write_unlock(&resource_lock); - return conflict; -} - -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - */ -int insert_resource(struct resource *parent, struct resource *new) -{ - struct resource *conflict; - - conflict = insert_resource_conflict(parent, new); - return conflict ? 
-EBUSY : 0; -} - -/** - * insert_resource_expand_to_fit - Insert a resource into the resource tree - * @root: root resource descriptor - * @new: new resource to insert - * - * Insert a resource into the resource tree, possibly expanding it in order - * to make it encompass any conflicting resources. - */ -void insert_resource_expand_to_fit(struct resource *root, struct resource *new) -{ - if (new->parent) - return; - - write_lock(&resource_lock); - for (;;) { - struct resource *conflict; - - conflict = __insert_resource(root, new); - if (!conflict) - break; - if (conflict == root) - break; - - /* Ok, expand resource to cover the conflict, then try again .. */ - if (conflict->start < new->start) - new->start = conflict->start; - if (conflict->end > new->end) - new->end = conflict->end; - - printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); - } - write_unlock(&resource_lock); -} - -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. 
- */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) -{ - struct resource *tmp, *parent = res->parent; - resource_size_t end = start + size - 1; - int result = -EBUSY; - - write_lock(&resource_lock); - - if ((start < parent->start) || (end > parent->end)) - goto out; - - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - - if (res->sibling && (res->sibling->start <= end)) - goto out; - - tmp = parent->child; - if (tmp != res) { - while (tmp->sibling != res) - tmp = tmp->sibling; - if (start <= tmp->end) - goto out; - } - - res->start = start; - res->end = end; - result = 0; - - out: - write_unlock(&resource_lock); - return result; -} - -static void __init __reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) -{ - struct resource *parent = root; - struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); - - if (!res) - return; - - res->name = name; - res->start = start; - res->end = end; - res->flags = IORESOURCE_BUSY; - - conflict = __request_resource(parent, res); - if (!conflict) - return; - - /* failed, split and try again */ - kfree(res); - - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; - - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); -} - -void __init reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) -{ - write_lock(&resource_lock); - __reserve_region_with_split(root, start, end, name); - write_unlock(&resource_lock); -} - -EXPORT_SYMBOL(adjust_resource); - -/** - * resource_alignment - calculate resource's alignment - * @res: resource pointer - * - * Returns alignment on success, 0 (invalid alignment) on 
failure. - */ -resource_size_t resource_alignment(struct resource *res) -{ - switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { - case IORESOURCE_SIZEALIGN: - return resource_size(res); - case IORESOURCE_STARTALIGN: - return res->start; - default: - return 0; - } -} - -/* - * This is compatibility stuff for IO resources. - * - * Note how this, unlike the above, knows about - * the IO flag meanings (busy etc). - * - * request_region creates a new busy region. - * - * check_region returns non-zero if the area is already busy. - * - * release_region releases a matching busy region. - */ - -static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); - -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - * @flags: IO resource flags - */ -struct resource * __request_region(struct resource *parent, - resource_size_t start, resource_size_t n, - const char *name, int flags) -{ - DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - - if (!res) - return NULL; - - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - res->flags |= flags; - - write_lock(&resource_lock); - - for (;;) { - struct resource *conflict; - - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } - if (conflict->flags & flags & IORESOURCE_MUXED) { - add_wait_queue(&muxed_resource_wait, &wait); - write_unlock(&resource_lock); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - remove_wait_queue(&muxed_resource_wait, &wait); - write_lock(&resource_lock); - continue; - } - /* Uhhuh, that didn't work out.. 
*/ - kfree(res); - res = NULL; - break; - } - write_unlock(&resource_lock); - return res; -} -EXPORT_SYMBOL(__request_region); - -/** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. - */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - kfree(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - -/** - * __release_region - release a previously reserved resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * The described resource region must match a currently busy region. 
- */ -void __release_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource **p; - resource_size_t end; - - p = &parent->child; - end = start + n - 1; - - write_lock(&resource_lock); - - for (;;) { - struct resource *res = *p; - - if (!res) - break; - if (res->start <= start && res->end >= end) { - if (!(res->flags & IORESOURCE_BUSY)) { - p = &res->child; - continue; - } - if (res->start != start || res->end != end) - break; - *p = res->sibling; - write_unlock(&resource_lock); - if (res->flags & IORESOURCE_MUXED) - wake_up(&muxed_resource_wait); - kfree(res); - return; - } - p = &res->sibling; - } - - write_unlock(&resource_lock); - - printk(KERN_WARNING "Trying to free nonexistent resource " - "<%016llx-%016llx>\n", (unsigned long long)start, - (unsigned long long)end); -} -EXPORT_SYMBOL(__release_region); - -/* - * Managed region resource - */ -struct region_devres { - struct resource *parent; - resource_size_t start; - resource_size_t n; -}; - -static void devm_region_release(struct device *dev, void *res) -{ - struct region_devres *this = res; - - __release_region(this->parent, this->start, this->n); -} - -static int devm_region_match(struct device *dev, void *res, void *match_data) -{ - struct region_devres *this = res, *match = match_data; - - return this->parent == match->parent && - this->start == match->start && this->n == match->n; -} - -struct resource * __devm_request_region(struct device *dev, - struct resource *parent, resource_size_t start, - resource_size_t n, const char *name) -{ - struct region_devres *dr = NULL; - struct resource *res; - - dr = devres_alloc(devm_region_release, sizeof(struct region_devres), - GFP_KERNEL); - if (!dr) - return NULL; - - dr->parent = parent; - dr->start = start; - dr->n = n; - - res = __request_region(parent, start, n, name, 0); - if (res) - devres_add(dev, dr); - else - devres_free(dr); - - return res; -} -EXPORT_SYMBOL(__devm_request_region); - -void 
__devm_release_region(struct device *dev, struct resource *parent, - resource_size_t start, resource_size_t n) -{ - struct region_devres match_data = { parent, start, n }; - - __release_region(parent, start, n); - WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, - &match_data)); -} -EXPORT_SYMBOL(__devm_release_region); - -/* - * Called from init/main.c to reserve IO ports. - */ -#define MAXRESERVE 4 -static int __init reserve_setup(char *str) -{ - static int reserved; - static struct resource reserve[MAXRESERVE]; - - for (;;) { - unsigned int io_start, io_num; - int x = reserved; - - if (get_option (&str, &io_start) != 2) - break; - if (get_option (&str, &io_num) == 0) - break; - if (x < MAXRESERVE) { - struct resource *res = reserve + x; - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; - res->child = NULL; - if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) - reserved = x+1; - } - } - return 1; -} - -__setup("reserve=", reserve_setup); - -/* - * Check if the requested addr and size spans more than any slot in the - * iomem resource tree. - */ -int iomem_map_sanity_check(resource_size_t addr, unsigned long size) -{ - struct resource *p = &iomem_resource; - int err = 0; - loff_t l; - - read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { - /* - * We can probably skip the resources without - * IORESOURCE_IO attribute? - */ - if (p->start >= addr + size) - continue; - if (p->end < addr) - continue; - if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && - PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) - continue; - /* - * if a resource is "BUSY", it's not a hardware resource - * but a driver mapping of such a resource; we don't want - * to warn for those; some drivers legitimately map only - * partial hardware resources. 
(example: vesafb) - */ - if (p->flags & IORESOURCE_BUSY) - continue; - - printk(KERN_WARNING "resource map sanity check conflict: " - "0x%llx 0x%llx 0x%llx 0x%llx %s\n", - (unsigned long long)addr, - (unsigned long long)(addr + size - 1), - (unsigned long long)p->start, - (unsigned long long)p->end, - p->name); - err = -1; - break; - } - read_unlock(&resource_lock); - - return err; -} - -#ifdef CONFIG_STRICT_DEVMEM -static int strict_iomem_checks = 1; -#else -static int strict_iomem_checks; -#endif - -/* - * check if an address is reserved in the iomem resource tree - * returns 1 if reserved, 0 if not reserved. - */ -int iomem_is_exclusive(u64 addr) -{ - struct resource *p = &iomem_resource; - int err = 0; - loff_t l; - int size = PAGE_SIZE; - - if (!strict_iomem_checks) - return 0; - - addr = addr & PAGE_MASK; - - read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { - /* - * We can probably skip the resources without - * IORESOURCE_IO attribute? - */ - if (p->start >= addr + size) - break; - if (p->end < addr) - continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { - err = 1; - break; - } - } - read_unlock(&resource_lock); - - return err; -} - -static int __init strict_iomem(char *str) -{ - if (strstr(str, "relaxed")) - strict_iomem_checks = 0; - if (strstr(str, "strict")) - strict_iomem_checks = 1; - return 1; -} - -__setup("iomem=", strict_iomem); -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner - * - * This code is based on the rt.c implementation in the preempt-rt tree. 
- * Portions of said code are - * - * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Copyright (C) 2006 Esben Nielsen - * Copyright (C) 2006 Kihon Technologies Inc., - * Steven Rostedt - * - * See rt.c in preempt-rt for proper credits and further information - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex_common.h" - -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk(""); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - -void rt_mutex_debug_task_free(struct task_struct *task) -{ - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); -} - -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. 
- */ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!debug_locks || detect || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !debug_locks) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - if (!debug_locks_off()) { - rcu_read_unlock(); - return; - } - - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! ]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. 
]\n\n"); -} - -void debug_rt_mutex_lock(struct rt_mutex *lock) -{ -} - -void debug_rt_mutex_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); -} - -void -debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) -{ -} - -void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); -} - -void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); - waiter->deadlock_task_pid = NULL; -} - -void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) -{ - put_pid(waiter->deadlock_task_pid); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - memset(waiter, 0x22, sizeof(*waiter)); -} - -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) -{ - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lock->name = name; -} - -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex 
mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); 
- td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - 
set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! 
*/ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int 
init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); -/* - * RT-Mutexes: simple blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner. - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner - * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt - * Copyright (C) 2006 Esben Nielsen - * - * See Documentation/rt-mutex-design.txt for details. - */ -#include -#include -#include -#include - -#include "rtmutex_common.h" - -/* - * lock->owner state tracking: - * - * lock->owner holds the task_struct pointer of the owner. Bit 0 - * is used to keep track of the "lock has waiters" state. - * - * owner bit0 - * NULL 0 lock is free (fast acquire possible) - * NULL 1 lock is free and has waiters and the top waiter - * is going to take the lock* - * taskpointer 0 lock is held (fast release possible) - * taskpointer 1 lock is held and has waiters** - * - * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 of lock->owner is 0. 
- * - * (*) It also can be a transitional state when grabbing the lock - * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, - * we need to set the bit0 before looking at the lock, and the owner may be - * NULL in this small time, hence this can be a transitional state. - * - * (**) There is a small time when bit 0 is set but there are no - * waiters. This can happen when grabbing the lock in the slow path. - * To prevent a cmpxchg of the owner releasing the lock, we need to - * set this bit before looking at the lock. - */ - -static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) -{ - unsigned long val = (unsigned long)owner; - - if (rt_mutex_has_waiters(lock)) - val |= RT_MUTEX_HAS_WAITERS; - - lock->owner = (struct task_struct *)val; -} - -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static void fixup_rt_mutex_waiters(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); -} - -/* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - -/* - * Calculate task priority from the waiter list priority - * - * Return task->normal_prio when the waiter list is empty or when - * the waiter is not allowed to do 
priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->pi_list_entry.prio, - task->normal_prio); -} - -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -static void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); -} - -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - -/* - * Adjust the priority chain. Also used for deadlock detection. - * Decreases task's usage by one - may thus free the task. - * Returns 0 or -EDEADLK. - */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, - struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) -{ - struct rt_mutex *lock; - struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; - int detect_deadlock, ret = 0, depth = 0; - unsigned long flags; - - detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, - deadlock_detect); - - /* - * The (de)boosting is a step by step approach with a lot of - * pitfalls. 
We want this to be preemptible and we want hold a - * maximum of two locks per step. So we have to check - * carefully whether things change under us. - */ - again: - if (++depth > max_lock_depth) { - static int prev_max; - - /* - * Print this only once. If the admin changes the limit, - * print a new message when reaching the limit again. - */ - if (prev_max != max_lock_depth) { - prev_max = max_lock_depth; - printk(KERN_WARNING "Maximum lock depth %d reached " - "task: %s (%d)\n", max_lock_depth, - top_task->comm, task_pid_nr(top_task)); - } - put_task_struct(task); - - return deadlock_detect ? -EDEADLK : 0; - } - retry: - /* - * Task can not go away as we did a get_task() before ! - */ - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - /* - * Check whether the end of the boosting chain has been - * reached or the state of the chain has changed while we - * dropped the locks. - */ - if (!waiter) - goto out_unlock_pi; - - /* - * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock. - */ - if (orig_waiter && !rt_mutex_owner(orig_lock)) - goto out_unlock_pi; - - /* - * Drop out, when the task has no waiters. Note, - * top_waiter can be NULL, when we are in the deboosting - * mode! - */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; - - /* - * When deadlock detection is off then we check, if further - * priority adjustment is necessary. - */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) - goto out_unlock_pi; - - lock = waiter->lock; - if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - cpu_relax(); - goto retry; - } - - /* Deadlock detection */ - if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? 
-EDEADLK : 0; - goto out_unlock_pi; - } - - top_waiter = rt_mutex_top_waiter(lock); - - /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); - - /* Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - if (!rt_mutex_owner(lock)) { - /* - * If the requeue above changed the top waiter, then we need - * to wake the new top waiter up to try to get the lock. - */ - - if (top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); - goto out_put_task; - } - put_task_struct(task); - - /* Grab the next task */ - task = rt_mutex_owner(lock); - get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); - - if (waiter == rt_mutex_top_waiter(lock)) { - /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - - } else if (top_waiter == waiter) { - /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); - waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - } - - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - top_waiter = rt_mutex_top_waiter(lock); - raw_spin_unlock(&lock->wait_lock); - - if (!detect_deadlock && waiter != top_waiter) - goto out_put_task; - - goto again; - - out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - out_put_task: - put_task_struct(task); - - return ret; -} - -/* - * Try to take an rt-mutex - * - * Must be called with lock->wait_lock held. - * - * @lock: the lock to be acquired. - * @task: the task which wants to acquire the lock - * @waiter: the waiter that is queued to the lock's wait list. 
(could be NULL) - */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) -{ - /* - * We have to be careful here if the atomic speedups are - * enabled, such that, when - * - no other waiter is on the lock - * - the lock has been released since we did the cmpxchg - * the lock can be released or taken while we are doing the - * checks and marking the lock with RT_MUTEX_HAS_WAITERS. - * - * The atomic acquire/release aware variant of - * mark_rt_mutex_waiters uses a cmpxchg loop. After setting - * the WAITERS bit, the atomic release / acquire can not - * happen anymore and lock->wait_lock protects us from the - * non-atomic case. - * - * Note, that this might set lock->owner = - * RT_MUTEX_HAS_WAITERS in the case the lock is not contended - * any more. This is fixed up when we take the ownership. - * This is the transitional state explained at the top of this file. - */ - mark_rt_mutex_waiters(lock); - - if (rt_mutex_owner(lock)) - return 0; - - /* - * It will get the lock because of one of these conditions: - * 1) there is no waiter - * 2) higher priority than waiters - * 3) it is top waiter - */ - if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } - } - - if (waiter || rt_mutex_has_waiters(lock)) { - unsigned long flags; - struct rt_mutex_waiter *top; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - /* remove the queued waiter. */ - if (waiter) { - plist_del(&waiter->list_entry, &lock->wait_list); - task->pi_blocked_on = NULL; - } - - /* - * We have to enqueue the top waiter(if it exists) into - * task->pi_waiters list. - */ - if (rt_mutex_has_waiters(lock)) { - top = rt_mutex_top_waiter(lock); - top->pi_list_entry.prio = top->list_entry.prio; - plist_add(&top->pi_list_entry, &task->pi_waiters); - } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - - /* We got the lock. 
*/ - debug_rt_mutex_lock(lock); - - rt_mutex_set_owner(lock, task); - - rt_mutex_deadlock_account_lock(lock, task); - - return 1; -} - -/* - * Task blocks on lock. - * - * Prepare waiter and propagate pi chain - * - * This must be called with lock->wait_lock held. - */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, - int detect_deadlock) -{ - struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; - int chain_walk = 0, res; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - waiter->task = task; - waiter->lock = lock; - plist_node_init(&waiter->list_entry, task->prio); - plist_node_init(&waiter->pi_list_entry, task->prio); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); - - task->pi_blocked_on = waiter; - - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - if (!owner) - return 0; - - if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); - - __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) - chain_walk = 1; - - if (!chain_walk) - return 0; - - /* - * The owner can't disappear while holding a lock, - * so the owner struct is protected by wait_lock. - * Gets dropped in rt_mutex_adjust_prio_chain()! - */ - get_task_struct(owner); - - raw_spin_unlock(&lock->wait_lock); - - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); - - raw_spin_lock(&lock->wait_lock); - - return res; -} - -/* - * Wake up the next waiter on the lock. 
- * - * Remove the top waiter from the current tasks waiter list and wake it up. - * - * Called with lock->wait_lock held. - */ -static void wakeup_next_waiter(struct rt_mutex *lock) -{ - struct rt_mutex_waiter *waiter; - unsigned long flags; - - raw_spin_lock_irqsave(¤t->pi_lock, flags); - - waiter = rt_mutex_top_waiter(lock); - - /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. - */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - - rt_mutex_set_owner(lock, NULL); - - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - - wake_up_process(waiter->task); -} - -/* - * Remove a waiter from a lock and give up - * - * Must be called with lock->wait_lock held and - * have just failed to try_to_take_rt_mutex(). - */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - int first = (waiter == rt_mutex_top_waiter(lock)); - struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; - int chain_walk = 0; - - raw_spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - - if (!owner) - return; - - if (first) { - - raw_spin_lock_irqsave(&owner->pi_lock, flags); - - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); - } - __rt_mutex_adjust_prio(owner); - - if (owner->pi_blocked_on) - chain_walk = 1; - - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - - if (!chain_walk) - return; - - /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ - get_task_struct(owner); - - raw_spin_unlock(&lock->wait_lock); - - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - - raw_spin_lock(&lock->wait_lock); -} - -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); -} - -/** - * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop - * @lock: the rt_mutex to take - * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) - * @timeout: the pre-initialized and started timer, or NULL for none - * @waiter: the pre-initialized rt_mutex_waiter - * - * lock->wait_lock must be held by the caller. - */ -static int __sched -__rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) -{ - int ret = 0; - - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? 
*/ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - raw_spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(waiter); - - schedule_rt_mutex(lock); - - raw_spin_lock(&lock->wait_lock); - set_current_state(state); - } - - return ret; -} - -/* - * Slow path lock function: - */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) -{ - struct rt_mutex_waiter waiter; - int ret = 0; - - debug_rt_mutex_init_waiter(&waiter); - - raw_spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - - ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); - - if (likely(!ret)) - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - - set_current_state(TASK_RUNNING); - - if (unlikely(ret)) - remove_waiter(lock, &waiter); - - /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock(&lock->wait_lock); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - debug_rt_mutex_free_waiter(&waiter); - - return ret; -} - -/* - * Slow path try-lock function: - */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) -{ - int ret = 0; - - raw_spin_lock(&lock->wait_lock); - - if (likely(rt_mutex_owner(lock) != current)) { - - ret = try_to_take_rt_mutex(lock, current, NULL); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. 
- */ - fixup_rt_mutex_waiters(lock); - } - - raw_spin_unlock(&lock->wait_lock); - - return ret; -} - -/* - * Slow path to release a rt-mutex: - */ -static void __sched -rt_mutex_slowunlock(struct rt_mutex *lock) -{ - raw_spin_lock(&lock->wait_lock); - - debug_rt_mutex_unlock(lock); - - rt_mutex_deadlock_account_unlock(current); - - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; - } - - wakeup_next_waiter(lock); - - raw_spin_unlock(&lock->wait_lock); - - /* Undo pi boosting if necessary: */ - rt_mutex_adjust_prio(current); -} - -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) -{ - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, NULL, detect_deadlock); -} - -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) -{ - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, timeout, detect_deadlock); -} - -static inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 1; - } - return slowfn(lock); -} - -static inline void -rt_mutex_fastunlock(struct rt_mutex *lock, - void 
(*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) - rt_mutex_deadlock_account_unlock(current); - else - slowfn(lock); -} - -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) -{ - might_sleep(); - - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock); - -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); - -/** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); - -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * Returns 1 on success and 0 on contention - */ -int __sched rt_mutex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); -} 
-EXPORT_SYMBOL_GPL(rt_mutex_trylock); - -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) -{ - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_unlock); - -/** - * rt_mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void rt_mutex_destroy(struct rt_mutex *lock) -{ - WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif -} - -EXPORT_SYMBOL_GPL(rt_mutex_destroy); - -/** - * __rt_mutex_init - initialize the rt lock - * - * @lock: the rt lock to be initialized - * - * Initialize the rt lock to unlocked state. - * - * Initializing of a locked rt lock is not allowed - */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) -{ - lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); - - debug_rt_mutex_init(lock, name); -} -EXPORT_SYMBOL_GPL(__rt_mutex_init); - -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * Special API call for PI-futex support - */ -void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - __rt_mutex_init(lock, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); -} - -/** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. 
Caller has to do serializing itself - * Special API call for PI-futex support - */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); - rt_mutex_deadlock_account_unlock(proxy_owner); -} - -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * @detect_deadlock: perform deadlock detection (1) or not (0) - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, int detect_deadlock) -{ - int ret; - - raw_spin_lock(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); - return 1; - } - - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - - if (ret && !rt_mutex_owner(lock)) { - /* - * Reset the return value. We might have - * returned with -EDEADLK and the owner - * released the lock while we were walking the - * pi chain. Let the waiter sort it out. - */ - ret = 0; - } - - if (unlikely(ret)) - remove_waiter(lock, waiter); - - raw_spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(waiter); - - return ret; -} - -/** - * rt_mutex_next_owner - return the next owner of the lock - * - * @lock: the rt lock query - * - * Returns the next owner of the lock or NULL - * - * Caller has to serialize against other accessors to the lock - * itself. 
- * - * Special API call for PI-futex support - */ -struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - return NULL; - - return rt_mutex_top_waiter(lock)->task; -} - -/** - * rt_mutex_finish_proxy_lock() - Complete lock acquisition - * @lock: the rt_mutex we were woken on - * @to: the timeout, null if none. hrtimer should already have - * been started. - * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: perform deadlock detection (1) or not (0) - * - * Complete the lock acquisition started our behalf by another thread. - * - * Returns: - * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK - * - * Special API call for PI-futex requeue support - */ -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock) -{ - int ret; - - raw_spin_lock(&lock->wait_lock); - - set_current_state(TASK_INTERRUPTIBLE); - - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - - set_current_state(TASK_RUNNING); - - if (unlikely(ret)) - remove_waiter(lock, waiter); - - /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock(&lock->wait_lock); - - return ret; -} -/* kernel/rwsem.c: R/W semaphores, public implementation - * - * Written by David Howells (dhowells@redhat.com). 
- * Derived from asm-i386/semaphore.h - */ - -#include -#include -#include -#include -#include - -#include -#include - -/* - * lock for reading - */ -void __sched down_read(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read); - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int down_read_trylock(struct rw_semaphore *sem) -{ - int ret = __down_read_trylock(sem); - - if (ret == 1) - rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_read_trylock); - -/* - * lock for writing - */ -void __sched down_write(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write); - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int down_write_trylock(struct rw_semaphore *sem) -{ - int ret = __down_write_trylock(sem); - - if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_write_trylock); - -/* - * release a read lock - */ -void up_read(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_read(sem); -} - -EXPORT_SYMBOL(up_read); - -/* - * release a write lock - */ -void up_write(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_write(sem); -} - -EXPORT_SYMBOL(up_write); - -/* - * downgrade write lock to read lock - */ -void downgrade_write(struct rw_semaphore *sem) -{ - /* - * lockdep: a downgraded write will live on as a write - * dependency. 
- */ - __downgrade_write(sem); -} - -EXPORT_SYMBOL(downgrade_write); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void down_read_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read_nested); - -void down_write_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write_nested); - -#endif - - -#ifdef CONFIG_SCHED_AUTOGROUP - -#include "sched.h" - -#include -#include -#include -#include -#include -#include - -unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; -static struct autogroup autogroup_default; -static atomic_t autogroup_seq_nr; - -void __init autogroup_init(struct task_struct *init_task) -{ - autogroup_default.tg = &root_task_group; - kref_init(&autogroup_default.kref); - init_rwsem(&autogroup_default.lock); - init_task->signal->autogroup = &autogroup_default; -} - -void autogroup_free(struct task_group *tg) -{ - kfree(tg->autogroup); -} - -static inline void autogroup_destroy(struct kref *kref) -{ - struct autogroup *ag = container_of(kref, struct autogroup, kref); - -#ifdef CONFIG_RT_GROUP_SCHED - /* We've redirected RT tasks to the root task group... 
*/ - ag->tg->rt_se = NULL; - ag->tg->rt_rq = NULL; -#endif - sched_destroy_group(ag->tg); -} - -static inline void autogroup_kref_put(struct autogroup *ag) -{ - kref_put(&ag->kref, autogroup_destroy); -} - -static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) -{ - kref_get(&ag->kref); - return ag; -} - -static inline struct autogroup *autogroup_task_get(struct task_struct *p) -{ - struct autogroup *ag; - unsigned long flags; - - if (!lock_task_sighand(p, &flags)) - return autogroup_kref_get(&autogroup_default); - - ag = autogroup_kref_get(p->signal->autogroup); - unlock_task_sighand(p, &flags); - - return ag; -} - -static inline struct autogroup *autogroup_create(void) -{ - struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); - struct task_group *tg; - - if (!ag) - goto out_fail; - - tg = sched_create_group(&root_task_group); - - if (IS_ERR(tg)) - goto out_free; - - kref_init(&ag->kref); - init_rwsem(&ag->lock); - ag->id = atomic_inc_return(&autogroup_seq_nr); - ag->tg = tg; -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Autogroup RT tasks are redirected to the root task group - * so we don't have to move tasks around upon policy change, - * or flail around trying to allocate bandwidth on the fly. - * A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. Thereafter, task_group() - * returns &root_task_group, so zero bandwidth is required. - */ - free_rt_sched_group(tg); - tg->rt_se = root_task_group.rt_se; - tg->rt_rq = root_task_group.rt_rq; -#endif - tg->autogroup = ag; - - return ag; - -out_free: - kfree(ag); -out_fail: - if (printk_ratelimit()) { - printk(KERN_WARNING "autogroup_create: %s failure.\n", - ag ? 
"sched_create_group()" : "kmalloc()"); - } - - return autogroup_kref_get(&autogroup_default); -} - -bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) -{ - if (tg != &root_task_group) - return false; - - if (p->sched_class != &fair_sched_class) - return false; - - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. - */ - if (p->flags & PF_EXITING) - return false; - - return true; -} - -static void -autogroup_move_group(struct task_struct *p, struct autogroup *ag) -{ - struct autogroup *prev; - struct task_struct *t; - unsigned long flags; - - BUG_ON(!lock_task_sighand(p, &flags)); - - prev = p->signal->autogroup; - if (prev == ag) { - unlock_task_sighand(p, &flags); - return; - } - - p->signal->autogroup = autogroup_kref_get(ag); - - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - - t = p; - do { - sched_move_task(t); - } while_each_thread(p, t); - -out: - unlock_task_sighand(p, &flags); - autogroup_kref_put(prev); -} - -/* Allocates GFP_KERNEL, cannot be called under any spinlock */ -void sched_autogroup_create_attach(struct task_struct *p) -{ - struct autogroup *ag = autogroup_create(); - - autogroup_move_group(p, ag); - /* drop extra reference added by autogroup_create() */ - autogroup_kref_put(ag); -} -EXPORT_SYMBOL(sched_autogroup_create_attach); - -/* Cannot be called under siglock. 
Currently has no users */ -void sched_autogroup_detach(struct task_struct *p) -{ - autogroup_move_group(p, &autogroup_default); -} -EXPORT_SYMBOL(sched_autogroup_detach); - -void sched_autogroup_fork(struct signal_struct *sig) -{ - sig->autogroup = autogroup_task_get(current); -} - -void sched_autogroup_exit(struct signal_struct *sig) -{ - autogroup_kref_put(sig->autogroup); -} - -static int __init setup_autogroup(char *str) -{ - sysctl_sched_autogroup_enabled = 0; - - return 1; -} - -__setup("noautogroup", setup_autogroup); - -#ifdef CONFIG_PROC_FS - -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) -{ - static unsigned long next = INITIAL_JIFFIES; - struct autogroup *ag; - int err; - - if (*nice < -20 || *nice > 19) - return -EINVAL; - - err = security_task_setnice(current, *nice); - if (err) - return err; - - if (*nice < 0 && !can_nice(current, *nice)) - return -EPERM; - - /* this is a heavy operation taking global locks.. */ - if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) - return -EAGAIN; - - next = HZ / 10 + jiffies; - ag = autogroup_task_get(p); - - down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); - if (!err) - ag->nice = *nice; - up_write(&ag->lock); - - autogroup_kref_put(ag); - - return err; -} - -void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) -{ - struct autogroup *ag = autogroup_task_get(p); - - if (!task_group_is_autogroup(ag->tg)) - goto out; - - down_read(&ag->lock); - seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); - up_read(&ag->lock); - -out: - autogroup_kref_put(ag); -} -#endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_SCHED_DEBUG -int autogroup_path(struct task_group *tg, char *buf, int buflen) -{ - if (!task_group_is_autogroup(tg)) - return 0; - - return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); -} -#endif /* CONFIG_SCHED_DEBUG */ - -#endif /* CONFIG_SCHED_AUTOGROUP */ -/* - * sched_clock for unstable cpu 
clocks - * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra - * - * Updates and enhancements: - * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt - * - * Based on code by: - * Ingo Molnar - * Guillaume Chazarain - * - * - * What: - * - * cpu_clock(i) provides a fast (execution time) high resolution - * clock with bounded drift between CPUs. The value of cpu_clock(i) - * is monotonic for constant i. The timestamp returned is in nanoseconds. - * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - * - * There is no strict promise about the base, although it tends to start - * at 0 on boot (but people really shouldn't rely on that). - * - * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) - * local_clock() -- is cpu_clock() on the current cpu. - * - * How: - * - * The implementation either uses sched_clock() when - * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the - * sched_clock() is assumed to provide these properties (mostly it means - * the architecture provides a globally synchronized highres time source). - * - * Otherwise it tries to create a semi stable clock from a mixture of other - * clocks, including: - * - * - GTOD (clock monotomic) - * - sched_clock() - * - explicit idle events - * - * We use GTOD as base and use sched_clock() deltas to improve resolution. The - * deltas are filtered to provide monotonicity and keeping it within an - * expected window. - * - * Furthermore, explicit sleep and wakeup hooks allow us to account for time - * that is otherwise invisible (TSC gets stopped). 
- * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). - */ -#include -#include -#include -#include -#include -#include - -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)(jiffies - INITIAL_JIFFIES) - * (NSEC_PER_SEC / HZ); -} -EXPORT_SYMBOL_GPL(sched_clock); - -__read_mostly int sched_clock_running; - -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; - -struct sched_clock_data { - u64 tick_raw; - u64 tick_gtod; - u64 clock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); - -static inline struct sched_clock_data *this_scd(void) -{ - return &__get_cpu_var(sched_clock_data); -} - -static inline struct sched_clock_data *cpu_sdc(int cpu) -{ - return &per_cpu(sched_clock_data, cpu); -} - -void sched_clock_init(void) -{ - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } - - sched_clock_running = 1; -} - -/* - * min, max except they take wrapping into account - */ - -static inline u64 wrap_min(u64 x, u64 y) -{ - return (s64)(x - y) < 0 ? x : y; -} - -static inline u64 wrap_max(u64 x, u64 y) -{ - return (s64)(x - y) > 0 ? 
x : y; -} - -/* - * update the percpu scd from the raw @now value - * - * - filter out backward motion - * - use the GTOD tick value to create a window to filter crazy TSC values - */ -static u64 sched_clock_local(struct sched_clock_data *scd) -{ - u64 now, clock, old_clock, min_clock, max_clock; - s64 delta; - -again: - now = sched_clock(); - delta = now - scd->tick_raw; - if (unlikely(delta < 0)) - delta = 0; - - old_clock = scd->clock; - - /* - * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); - */ - - clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, old_clock); - max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); - - clock = wrap_max(clock, min_clock); - clock = wrap_min(clock, max_clock); - - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) - goto again; - - return clock; -} - -static u64 sched_clock_remote(struct sched_clock_data *scd) -{ - struct sched_clock_data *my_scd = this_scd(); - u64 this_clock, remote_clock; - u64 *ptr, old_val, val; - - sched_clock_local(my_scd); -again: - this_clock = my_scd->clock; - remote_clock = scd->clock; - - /* - * Use the opportunity that we have both locks - * taken to couple the two clocks: we take the - * larger time as the latest time for both - * runqueues. (this creates monotonic movement) - */ - if (likely((s64)(remote_clock - this_clock) < 0)) { - ptr = &scd->clock; - old_val = remote_clock; - val = this_clock; - } else { - /* - * Should be rare, but possible: - */ - ptr = &my_scd->clock; - old_val = this_clock; - val = remote_clock; - } - - if (cmpxchg64(ptr, old_val, val) != old_val) - goto again; - - return val; -} - -/* - * Similar to cpu_clock(), but requires local IRQs to be disabled. - * - * See cpu_clock(). 
- */ -u64 sched_clock_cpu(int cpu) -{ - struct sched_clock_data *scd; - u64 clock; - - WARN_ON_ONCE(!irqs_disabled()); - - if (sched_clock_stable) - return sched_clock(); - - if (unlikely(!sched_clock_running)) - return 0ull; - - scd = cpu_sdc(cpu); - - if (cpu != smp_processor_id()) - clock = sched_clock_remote(scd); - else - clock = sched_clock_local(scd); - - return clock; -} - -void sched_clock_tick(void) -{ - struct sched_clock_data *scd; - u64 now, now_gtod; - - if (sched_clock_stable) - return; - - if (unlikely(!sched_clock_running)) - return; - - WARN_ON_ONCE(!irqs_disabled()); - - scd = this_scd(); - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); - - scd->tick_raw = now; - scd->tick_gtod = now_gtod; - sched_clock_local(scd); -} - -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - sched_clock_cpu(smp_processor_id()); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - if (timekeeping_suspended) - return; - - sched_clock_tick(); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. - * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); - - return clock; -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. 
- * - * See cpu_clock(). - */ -u64 local_clock(void) -{ - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); - local_irq_restore(flags); - - return clock; -} - -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - -u64 sched_clock_cpu(int cpu) -{ - if (unlikely(!sched_clock_running)) - return 0; - - return sched_clock(); -} - -u64 cpu_clock(int cpu) -{ - return sched_clock_cpu(cpu); -} - -u64 local_clock(void) -{ - return sched_clock_cpu(0); -} - -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); -/* - * kernel/sched/core.c - * - * Kernel scheduler and related syscalls - * - * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. 
- * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#ifdef CONFIG_PARAVIRT -#include -#endif - -#include "sched.h" -#include "../workqueue_sched.h" - -#define CREATE_TRACE_POINTS -#include - -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - unsigned long delta; - ktime_t soft, hard, now; - - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } -} - -DEFINE_MUTEX(sched_domains_mutex); -DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -static void update_rq_clock_task(struct rq *rq, s64 delta); - -void update_rq_clock(struct rq *rq) -{ - s64 delta; - - if (rq->skip_clock_update > 0) - return; - - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - -/* - * Debugging: various feature bits - */ - -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | - -const_debug unsigned int 
sysctl_sched_features = -#include "features.h" - 0; - -#undef SCHED_FEAT - -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static __read_mostly char *sched_feat_names[] = { -#include "features.h" - NULL -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int neg = 0; - int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct 
file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - -/* - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ -const_debug unsigned int sysctl_sched_nr_migrate = 32; - -/* - * period over which we average the RT time consumption, measured - * in ms. - * - * default: 1s - */ -const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; - -/* - * period over which we measure -rt task cpu usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - -__read_mostly int scheduler_running; - -/* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - - - -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - } -} - -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 
- */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) - __releases(rq->lock) - __releases(p->pi_lock) -{ - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} - -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - -#ifdef CONFIG_SCHED_HRTICK -/* - * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. - */ - -static void hrtick_clear(struct rq *rq) -{ - if (hrtimer_active(&rq->hrtick_timer)) - hrtimer_cancel(&rq->hrtick_timer); -} - -/* - * High-resolution timer tick. - * Runs from hardirq context with interrupts disabled. 
- */ -static enum hrtimer_restart hrtick(struct hrtimer *timer) -{ - struct rq *rq = container_of(timer, struct rq, hrtick_timer); - - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - - raw_spin_lock(&rq->lock); - update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); - - return HRTIMER_NORESTART; -} - -#ifdef CONFIG_SMP -/* - * called from hardirq (IPI) context - */ -static void __hrtick_start(void *arg) -{ - struct rq *rq = arg; - - raw_spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); - rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); -} - -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and irqs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); - - hrtimer_set_expires(timer, time); - - if (rq == this_rq()) { - hrtimer_restart(timer); - } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); - rq->hrtick_csd_pending = 1; - } -} - -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} -#else -/* - * Called to set the hrtick timer state. 
- * - * called with rq->lock held and irqs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static inline void init_hrtick(void) -{ -} -#endif /* CONFIG_SMP */ - -static void init_rq_hrtick(struct rq *rq) -{ -#ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; -#endif - - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rq->hrtick_timer.function = hrtick; -} -#else /* CONFIG_SCHED_HRTICK */ -static inline void hrtick_clear(struct rq *rq) -{ -} - -static inline void init_rq_hrtick(struct rq *rq) -{ -} - -static inline void init_hrtick(void) -{ -} -#endif /* CONFIG_SCHED_HRTICK */ - -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -void resched_task(struct task_struct *p) -{ - int cpu; - - assert_raw_spin_locked(&task_rq(p)->lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} - -void resched_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; - resched_task(cpu_curr(cpu)); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -#ifdef CONFIG_NO_HZ -/* - * In the semi idle case, use the nearest busy cpu for migrating timers - * from an idle cpu. This is good for power-savings. 
- * - * We don't do similar optimization for completely idle system, as - * selecting an idle cpu will add more delays to the timers than intended - * (as that cpu's timer base may not be uptodate wrt jiffies etc). - */ -int get_nohz_timer_target(void) -{ - int cpu = smp_processor_id(); - int i; - struct sched_domain *sd; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } - } - } -unlock: - rcu_read_unlock(); - return cpu; -} -/* - * When add_timer_on() enqueues a timer into the timer wheel of an - * idle CPU then this timer might expire before the next timer event - * which is scheduled to wake up that CPU. In case of a completely - * idle system the next event might even be infinite time into the - * future. wake_up_idle_cpu() ensures that the CPU is woken up and - * leaves the inner idle loop so the newly added timer is taken into - * account when the CPU goes back to idle and evaluates the timer - * wheel for the next timer event. - */ -void wake_up_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (cpu == smp_processor_id()) - return; - - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. 
The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) - smp_send_reschedule(cpu); -} - -static inline bool got_nohz_idle_kick(void) -{ - int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); -} - -#else /* CONFIG_NO_HZ */ - -static inline bool got_nohz_idle_kick(void) -{ - return false; -} - -#endif /* CONFIG_NO_HZ */ - -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); - - while ((s64)(rq->clock - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } -} - -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} -#endif /* CONFIG_SMP */ - -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) -/* - * Iterate task_group tree rooted at *from, calling @down when first entering a - * node and @up when leaving it for the final time. - * - * Caller must hold rcu_lock or sufficient equivalent. 
- */ -int walk_tg_tree_from(struct task_group *from, - tg_visitor down, tg_visitor up, void *data) -{ - struct task_group *parent, *child; - int ret; - - parent = from; - -down: - ret = (*down)(parent, data); - if (ret) - goto out; - list_for_each_entry_rcu(child, &parent->children, siblings) { - parent = child; - goto down; - -up: - continue; - } - ret = (*up)(parent, data); - if (ret || parent == from) - goto out; - - child = parent; - parent = parent->parent; - if (parent) - goto up; -out: - return ret; -} - -int tg_nop(struct task_group *tg, void *data) -{ - return 0; -} -#endif - -void update_cpu_load(struct rq *this_rq); - -static void set_load_weight(struct task_struct *p) -{ - int prio = p->static_prio - MAX_RT_PRIO; - struct load_weight *load = &p->se.load; - - /* - * SCHED_IDLE tasks get minimal weight: - */ - if (p->policy == SCHED_IDLE) { - load->weight = scale_load(WEIGHT_IDLEPRIO); - load->inv_weight = WMULT_IDLEPRIO; - return; - } - - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; -} - -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -{ - update_rq_clock(rq); - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, flags); -} - -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -{ - update_rq_clock(rq); - sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, flags); -} - -void activate_task(struct rq *rq, struct task_struct *p, int flags) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, flags); -} - -void deactivate_task(struct rq *rq, struct task_struct *p, int flags) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - - dequeue_task(rq, p, flags); -} - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. 
- * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. 
- */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ -/* - * In theory, the compile should just see 0 here, and optimize out the call - * to sched_rt_avg_update. But I don't trust it... - */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. 
- * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { - u64 st; - - steal = paravirt_steal_clock(cpu_of(rq)); - steal -= rq->prev_steal_time_rq; - - if (unlikely(steal > delta)) - steal = delta; - - st = steal_ticks(steal); - steal = st * TICK_NSEC; - - rq->prev_steal_time_rq += steal; - - delta -= steal; - } -#endif - - rq->clock_task += delta; - -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) - sched_rt_avg_update(rq, irq_delta + steal); -#endif -} - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something 
- * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. - */ - old_stop->sched_class = &rt_sched_class; - } -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; -} - -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - int prio; - - if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; -} - -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) -{ - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; -} - -/** - * task_curr - is this task currently executing on a CPU? - * @p: the task in question. 
- */ -inline int task_curr(const struct task_struct *p) -{ - return cpu_curr(task_cpu(p)) == p; -} - -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) - p->sched_class->prio_changed(rq, p, oldprio); -} - -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) -{ - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { - rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_task(rq->curr); - break; - } - } - } - - /* - * A queue event has occurred, and we're going to schedule. In - * this case, we can save a useless back to back clock update. - */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; -} - -#ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ -#ifdef CONFIG_SCHED_DEBUG - /* - * We should never call set_task_cpu() on a blocked task, - * ttwu() will sort out the placement. - */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); - -#ifdef CONFIG_LOCKDEP - /* - * The caller should hold either p->pi_lock or rq->lock, when changing - * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. - * - * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). - * - * Furthermore, all task_rq users should acquire both locks, see - * task_rq_lock(). 
- */ - WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); -#endif -#endif - - trace_sched_migrate_task(p, new_cpu); - - if (task_cpu(p) != new_cpu) { - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); - } - - __set_task_cpu(p, new_cpu); -} - -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); - -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - unsigned long flags; - int running, on_rq; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! 
- * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - trace_sched_wait_task(p); - running = task_running(rq, p); - on_rq = p->on_rq; - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } - - return ncsw; -} - -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) 
- * - * NOTE: this function doesn't have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); -} -EXPORT_SYMBOL_GPL(kick_process); -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_SMP -/* - * ->cpus_allowed is protected by both rq->lock and p->pi_lock - */ -static int select_fallback_rq(int cpu, struct task_struct *p) -{ - int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); - - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - return dest_cpu; - - /* No more Mr. Nice Guy. */ - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } - - return dest_cpu; -} - -/* - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. - */ -static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) -{ - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); - - /* - * In order not to call set_task_cpu() on a blocking task we need - * to rely on ttwu() to place the task on a valid ->cpus_allowed - * cpu. - * - * Since this is common to all placement strategies, this lives here. 
- * - * [ this allows ->select_task() to simply return task_cpu(p) and - * not worry about this generic constraint ] - */ - if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); - - return cpu; -} - -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} -#endif - -static void -ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -{ -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); - -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); - } else { - struct sched_domain *sd; - - schedstat_inc(p, se.statistics.nr_wakeups_remote); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - rcu_read_unlock(); - } - - if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - -#endif /* CONFIG_SMP */ - - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); - - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ -} - -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = 1; - - /* if a worker is waking up, notify workqueue */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); -} - -/* - * Mark the task runnable and perform wakeup-preemption. 
- */ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -{ - trace_sched_wakeup(p, true); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif -} - -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -{ -#ifdef CONFIG_SMP - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; -#endif - - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); -} - -/* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. - */ -static int ttwu_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; - - rq = __task_rq_lock(p); - if (p->on_rq) { - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; - } - __task_rq_unlock(rq); - - return ret; -} - -#ifdef CONFIG_SMP -static void sched_ttwu_pending(void) -{ - struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; - - raw_spin_lock(&rq->lock); - - while (llist) { - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); - } - - raw_spin_unlock(&rq->lock); -} - -void scheduler_ipi(void) -{ - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. 
Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); - - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); -} - -static void ttwu_queue_remote(struct task_struct *p, int cpu) -{ - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); -} - -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -static int ttwu_activate_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; - - rq = __task_rq_lock(p); - if (p->on_cpu) { - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; - } - __task_rq_unlock(rq); - - return ret; - -} -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - -static inline int ttwu_share_cache(int this_cpu, int that_cpu) -{ - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -} -#endif /* CONFIG_SMP */ - -static void ttwu_queue(struct task_struct *p, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - -#if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); - return; - } -#endif - - raw_spin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - raw_spin_unlock(&rq->lock); -} - -/** - * try_to_wake_up - wake up a thread - * @p: the thread to be awakened - * @state: the mask of task states that can be woken - * @wake_flags: wake modifier flags (WF_*) - * - * Put it on the run-queue if it's not already there. 
The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * Returns %true if @p was woken up, %false if it was already running - * or @state didn't match @p's state. - */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -{ - unsigned long flags; - int cpu, success = 0; - - smp_wmb(); - raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) - goto out; - - success = 1; /* we're going to change ->state */ - cpu = task_cpu(p); - - if (p->on_rq && ttwu_remote(p, wake_flags)) - goto stat; - -#ifdef CONFIG_SMP - /* - * If the owning (remote) cpu is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. - */ - while (p->on_cpu) { -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - /* - * In case the architecture enables interrupts in - * context_switch(), we cannot busy wait, since that - * would lead to deadlocks when an interrupt hits and - * tries to wake up @prev. So bail and do a complete - * remote wakeup. - */ - if (ttwu_activate_remote(p, wake_flags)) - goto stat; -#else - cpu_relax(); -#endif - } - /* - * Pairs with the smp_wmb() in finish_lock_switch(). 
- */ - smp_rmb(); - - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) { - wake_flags |= WF_MIGRATED; - set_task_cpu(p, cpu); - } -#endif /* CONFIG_SMP */ - - ttwu_queue(p, cpu); -stat: - ttwu_stat(p, cpu, wake_flags); -out: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - - return success; -} - -/** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - BUG_ON(rq != this_rq()); - BUG_ON(p == current); - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - if (!p->on_rq) - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** - * wake_up_process - Wake up a specific process - * @p: The process to be woken up. - * - * Attempt to wake up the nominated process and move it to the set of runnable - * processes. Returns 1 if the process was woken up, 0 if it was already - * running. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -int wake_up_process(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_ALL, 0); -} -EXPORT_SYMBOL(wake_up_process); - -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); -} - -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. - * - * __sched_fork() is basic setup used by init_idle() too: - */ -static void __sched_fork(struct task_struct *p) -{ - p->on_rq = 0; - - p->se.on_rq = 0; - p->se.exec_start = 0; - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->se.nr_migrations = 0; - p->se.vruntime = 0; - INIT_LIST_HEAD(&p->se.group_node); - -#ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); -#endif - - INIT_LIST_HEAD(&p->rt.run_list); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif -} - -/* - * fork()/clone()-time setup: - */ -void sched_fork(struct task_struct *p) -{ - unsigned long flags; - int cpu = get_cpu(); - - __sched_fork(p); - /* - * We mark the process as running here. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; - - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - - /* - * Revert to default priority/policy on fork if requested. - */ - if (unlikely(p->sched_reset_on_fork)) { - if (task_has_rt_policy(p)) { - p->policy = SCHED_NORMAL; - p->static_prio = NICE_TO_PRIO(0); - p->rt_priority = 0; - } else if (PRIO_TO_NICE(p->static_prio) < 0) - p->static_prio = NICE_TO_PRIO(0); - - p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p); - - /* - * We don't need the reset flag anymore after the fork. 
It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - } - - if (!rt_prio(p->prio)) - p->sched_class = &fair_sched_class; - - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (likely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif -#if defined(CONFIG_SMP) - p->on_cpu = 0; -#endif -#ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif -#ifdef CONFIG_SMP - plist_node_init(&p->pushable_tasks, MAX_PRIO); -#endif - - put_cpu(); -} - -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. 
- */ -void wake_up_new_task(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - - raw_spin_lock_irqsave(&p->pi_lock, flags); -#ifdef CONFIG_SMP - /* - * Fork balancing, do it here and not earlier because: - * - cpus_allowed can change in the fork path - * - any previously selected cpu might disappear through hotplug - */ - set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); -#endif - - rq = __task_rq_lock(p); - activate_task(rq, p, 0); - p->on_rq = 1; - trace_sched_wakeup_new(p, true); - check_preempt_curr(rq, p, WF_FORK); -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); -#endif - task_rq_unlock(rq, p, &flags); -} - -#ifdef CONFIG_PREEMPT_NOTIFIERS - -/** - * preempt_notifier_register - tell me when current is being preempted & rescheduled - * @notifier: notifier struct to register - */ -void preempt_notifier_register(struct preempt_notifier *notifier) -{ - hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -} -EXPORT_SYMBOL_GPL(preempt_notifier_register); - -/** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister - * - * This is safe to call from within a preemption notifier. 
- */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(¬ifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @prev: the current task that is being switched out - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); - trace_sched_switch(prev, next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch - * @prev: the thread we just switched away from. 
- * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. - * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) - __releases(rq->lock) -{ - struct mm_struct *mm = rq->prev_mm; - long prev_state; - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul - */ - prev_state = prev->state; - finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - finish_lock_switch(rq, prev); - - fire_sched_in_preempt_notifiers(current); - if (mm) - mmdrop(mm); - if (unlikely(prev_state == TASK_DEAD)) { - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. 
- */ - kprobe_flush_task(prev); - put_task_struct(prev); - } -} - -#ifdef CONFIG_SMP - -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - -/* rq->lock is NOT held, but preemption is disabled */ -static inline void post_schedule(struct rq *rq) -{ - if (rq->post_schedule) { - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->curr->sched_class->post_schedule) - rq->curr->sched_class->post_schedule(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); - - rq->post_schedule = 0; - } -} - -#else - -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - -static inline void post_schedule(struct rq *rq) -{ -} - -#endif - -/** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. - */ -asmlinkage void schedule_tail(struct task_struct *prev) - __releases(rq->lock) -{ - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); - - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ - post_schedule(rq); - -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); -} - -/* - * context_switch - switch to the new MM and the new - * thread's register state. - */ -static inline void -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - struct mm_struct *mm, *oldmm; - - prepare_task_switch(rq, prev, next); - - mm = next->mm; - oldmm = prev->active_mm; - /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. 
- */ - arch_start_context_switch(prev); - - if (!mm) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm(oldmm, mm, next); - - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; - } - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif - - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); -} - -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. - */ -unsigned long nr_running(void) -{ - unsigned long i, sum = 0; - - for_each_online_cpu(i) - sum += cpu_rq(i)->nr_running; - - return sum; -} - -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. 
Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - -unsigned long long nr_context_switches(void) -{ - int i; - unsigned long long sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_switches; - - return sum; -} - -unsigned long nr_iowait(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); - - return sum; -} - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); - -static long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ -/* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_tasks_idle; - -void calc_load_account_idle(struct rq *this_rq) -{ - long delta; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); -} - -static long calc_load_fold_idle(void) -{ - long delta = 0; - - /* - * Its got a race, we don't care... 
- */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(unsigned long ticks) -{ - long delta, active, n; - - if (time_before(jiffies, calc_load_update)) - return; - - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * If we were idle for multiple load cycles, apply them. - */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. 
- * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. - */ -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} - -static inline long calc_load_fold_idle(void) -{ - return 0; -} - -static void calc_global_nohz(unsigned long ticks) -{ -} -#endif - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active; - - calc_global_nohz(ticks); - - if (time_before(jiffies, calc_load_update + 10)) - return; - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. 
- */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. 
- */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. 
- */ -void update_cpu_load(struct rq *this_rq) -{ - unsigned long this_load = this_rq->load.weight; - unsigned long curr_jiffies = jiffies; - unsigned long pending_updates; - int i, scale; - - this_rq->nr_load_updates++; - - /* Avoid repeated calls on same jiffy, when moving in and out of idle */ - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -static void update_cpu_load_active(struct rq *this_rq) -{ - update_cpu_load(this_rq); - - calc_load_account_active(this_rq); -} - -#ifdef CONFIG_SMP - -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. 
- */ -void sched_exec(void) -{ - struct task_struct *p = current; - unsigned long flags; - int dest_cpu; - - raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); - if (dest_cpu == smp_processor_id()) - goto unlock; - - if (likely(cpu_active(dest_cpu))) { - struct migration_arg arg = { p, dest_cpu }; - - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; - } -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -} - -#endif - -DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); - -EXPORT_PER_CPU_SYMBOL(kstat); -EXPORT_PER_CPU_SYMBOL(kernel_cpustat); - -/* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. - */ -static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - if (task_current(rq, p)) { - update_rq_clock(rq); - ns = rq->clock_task - p->se.exec_start; - if ((s64)ns < 0) - ns = 0; - } - - return ns; -} - -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -/* - * Return accounted runtime for the task. - * In case the task is currently running, return the runtime plus current's - * pending runtime that have not been accounted yet. 
- */ -unsigned long long task_sched_runtime(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif - /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. If the root cgroup - * is the only cgroup, then nothing else should be necessary. - * - */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif -} - - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - int index; - - /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. 
- * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += cputime; - - /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) -{ - /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. 
- * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - int index; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; - else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; - else - index = CPUTIME_SYSTEM; - - __account_system_time(p, cputime, cputime_scaled, index); -} - -/* - * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64) cputime; -} - -/* - * Account for idle time. 
- * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; -} - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { - u64 steal, st = 0; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; - - account_steal_time(st); - return st; - } -#endif - return false; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to - * @user_tick: is the tick from userspace - * @rq: the pointer to rq - * - * Tick demultiplexing follows the order - * - pending hardirq update - * - pending softirq update - * - user_time - * - idle_time - * - system time - * - check for guest_time - * - else account as system_time - * - * Check for hardirq is done both for system and user time as there is - * no timer going off while we are on hardirq and hence we may never get an - * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq - * softirq as those do not count in task exec_runtime any more. 
- */ -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (steal_account_process_tick()) - return; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; - } else if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - * Also, p->stime needs to be updated for ksoftirqd. - */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); - } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); - } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); - } -} - -static void irqtime_account_idle_ticks(int ticks) -{ - int i; - struct rq *rq = this_rq(); - - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -/* - * Account a single tick of cpu time. 
- * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. 
- * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, utime = p->utime, total = utime + p->stime; - - /* - * Use CFS's precise accounting: - */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else - utime = rtime; - - /* - * Compare with previous values, to keep monotonicity: - */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); - - *ut = p->prev_utime; - *st = p->prev_stime; -} - -/* - * Must be called with siglock held. 
- */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct signal_struct *sig = p->signal; - struct task_cputime cputime; - cputime_t rtime, utime, total; - - thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; -} -#endif - -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - */ -void scheduler_tick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; - - sched_clock_tick(); - - raw_spin_lock(&rq->lock); - update_rq_clock(rq); - update_cpu_load_active(rq); - curr->sched_class->task_tick(rq, curr, 0); - raw_spin_unlock(&rq->lock); - - perf_event_task_tick(); - -#ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq, cpu); -#endif -} - -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -void __kprobes add_preempt_count(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - preempt_count() += val; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? 
- */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -} -EXPORT_SYMBOL(add_preempt_count); - -void __kprobes sub_preempt_count(int val) -{ -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif - - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; -} -EXPORT_SYMBOL(sub_preempt_count); - -#endif - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) -{ - struct pt_regs *regs = get_irq_regs(); - - if (oops_in_progress) - return; - - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); - - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); - - if (regs) - show_regs(regs); - else - dump_stack(); -} - -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) -{ - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. 
- */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) - __schedule_bug(prev); - rcu_sleep_check(); - - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - - schedstat_inc(this_rq(), sched_count); -} - -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - -/* - * Pick up the highest-prio task: - */ -static inline struct task_struct * -pick_next_task(struct rq *rq) -{ - const struct sched_class *class; - struct task_struct *p; - - /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: - */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) - return p; - } - - for_each_class(class) { - p = class->pick_next_task(rq); - if (p) - return p; - } - - BUG(); /* the idle class will always have a runnable task */ -} - -/* - * __schedule() is the main scheduler function. - */ -static void __sched __schedule(void) -{ - struct task_struct *prev, *next; - unsigned long *switch_count; - struct rq *rq; - int cpu; - -need_resched: - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_note_context_switch(cpu); - prev = rq->curr; - - schedule_debug(prev); - - if (sched_feat(HRTICK)) - hrtick_clear(rq); - - raw_spin_lock_irq(&rq->lock); - - switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) { - prev->state = TASK_RUNNING; - } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); - prev->on_rq = 0; - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. 
- */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev, cpu); - if (to_wakeup) - try_to_wake_up_local(to_wakeup); - } - } - switch_count = &prev->nvcsw; - } - - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); - - put_prev_task(rq, prev); - next = pick_next_task(rq); - clear_tsk_need_resched(prev); - rq->skip_clock_update = 0; - - if (likely(prev != next)) { - rq->nr_switches++; - rq->curr = next; - ++*switch_count; - - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - } else - raw_spin_unlock_irq(&rq->lock); - - post_schedule(rq); - - preempt_enable_no_resched(); - if (need_resched()) - goto need_resched; -} - -static inline void sched_submit_work(struct task_struct *tsk) -{ - if (!tsk->state) - return; - /* - * If we are going to sleep and we have plugged IO queued, - * make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); -} - -asmlinkage void __sched schedule(void) -{ - struct task_struct *tsk = current; - - sched_submit_work(tsk); - __schedule(); -} -EXPORT_SYMBOL(schedule); - -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! 
"owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage void __sched notrace preempt_schedule(void) -{ - struct thread_info *ti = current_thread_info(); - - /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. - */ - if (likely(ti->preempt_count || irqs_disabled())) - return; - - do { - add_preempt_count_notrace(PREEMPT_ACTIVE); - __schedule(); - sub_preempt_count_notrace(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); -} -EXPORT_SYMBOL(preempt_schedule); - -/* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. 
- */ -asmlinkage void __sched preempt_schedule_irq(void) -{ - struct thread_info *ti = current_thread_info(); - - /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); - - do { - add_preempt_count(PREEMPT_ACTIVE); - local_irq_enable(); - __schedule(); - local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); -} - -#endif /* CONFIG_PREEMPT */ - -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, - void *key) -{ - return try_to_wake_up(curr->private, mode, wake_flags); -} -EXPORT_SYMBOL(default_wake_function); - -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) -{ - __wake_up_common(q, mode, 1, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ - __wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - int wake_flags = WF_SYNC; - - if (unlikely(!q)) - return; - - if (unlikely(!nr_exclusive)) - wake_flags = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - __wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - __add_wait_queue_tail_exclusive(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. 
It is not - * interruptible. - * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. 
- */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Returns: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Returns: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. 
- * - */ -bool completion_done(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(completion_done); - -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - -#ifdef CONFIG_RT_MUTEXES - -/* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance logic. 
- */ -void rt_mutex_setprio(struct task_struct *p, int prio) -{ - int oldprio, on_rq, running; - struct rq *rq; - const struct sched_class *prev_class; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = __task_rq_lock(p); - - trace_sched_pi_setprio(p, prio); - oldprio = p->prio; - prev_class = p->sched_class; - on_rq = p->on_rq; - running = task_current(rq, p); - if (on_rq) - dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - if (rt_prio(prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - - p->prio = prio; - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); - - check_class_changed(rq, p, prev_class, oldprio); - __task_rq_unlock(rq); -} - -#endif - -void set_user_nice(struct task_struct *p, long nice) -{ - int old_prio, delta, on_rq; - unsigned long flags; - struct rq *rq; - - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) - return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rq = task_rq_lock(p, &flags); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: - */ - if (task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } - on_rq = p->on_rq; - if (on_rq) - dequeue_task(rq, p, 0); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); - old_prio = p->prio; - p->prio = effective_prio(p); - delta = p->prio - old_prio; - - if (on_rq) { - enqueue_task(rq, p, 0); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); - } -out_unlock: - task_rq_unlock(rq, p, &flags); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. 
- */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - - nice = TASK_NICE(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; - - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * This is the priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. - */ -int task_prio(const struct task_struct *p) -{ - return p->prio - MAX_RT_PRIO; -} - -/** - * task_nice - return the nice value of a given task. - * @p: the task in question. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - */ -int idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->curr != rq->idle) - return 0; - - if (rq->nr_running) - return 0; - -#ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) - return 0; -#endif - - return 1; -} - -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* Actually do priority change: must hold rq lock. 
*/ -static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) -{ - p->policy = policy; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - set_load_weight(p); -} - -/* - * check the target process has a UID that matches the current process's - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - if (cred->user->user_ns == pcred->user->user_ns) - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); - else - match = false; - rcu_read_unlock(); - return match; -} - -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) -{ - int retval, oldprio, oldpolicy = -1, on_rq, running; - unsigned long flags; - const struct sched_class *prev_class; - struct rq *rq; - int reset_on_fork; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); -recheck: - /* double check policy once rq lock held */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; - - if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; - } - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. 
- */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) - return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; - } - - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) - return -EPERM; - } - - /* can't change other user's priorities */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - - if (user) { - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* - * make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - * - * To be able to change p->policy safely, the appropriate - * runqueue lock must be held. 
- */ - rq = task_rq_lock(p, &flags); - - /* - * Changing the policy of the stop threads its a very bad idea - */ - if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); - return -EINVAL; - } - - /* - * If not changing anything there's no need to proceed further: - */ - if (unlikely(policy == p->policy && (!rt_policy(policy) || - param->sched_priority == p->rt_priority))) { - - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return 0; - } - -#ifdef CONFIG_RT_GROUP_SCHED - if (user) { - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0 && - !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); - return -EPERM; - } - } -#endif - - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); - goto recheck; - } - on_rq = p->on_rq; - running = task_current(rq, p); - if (on_rq) - dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; - prev_class = p->sched_class; - __setscheduler(rq, p, policy, param->sched_priority); - - if (running) - p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); - - check_class_changed(rq, p, prev_class, oldprio); - task_rq_unlock(rq, p, &flags); - - rt_mutex_adjust_pi(p); - - return 0; -} - -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. 
- */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, true); -} -EXPORT_SYMBOL_GPL(sched_setscheduler); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return __sched_setscheduler(p, policy, param, false); -} - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, - struct sched_param __user *, param) -{ - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. 
- */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, -1, param); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -{ - struct task_struct *p; - int retval; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); - } - rcu_read_unlock(); - return retval; -} - -/** - * sys_sched_getparam - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; - - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - put_online_cpus(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; - - retval = security_task_setscheduler(p); - if (retval) - goto out_unlock; - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); -again: - retval = set_cpus_allowed_ptr(p, new_mask); - - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_unlock: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - put_online_cpus(); - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - struct cpumask *new_mask) -{ - if (len < cpumask_size()) - cpumask_clear(new_mask); - else if (len > cpumask_size()) - len = cpumask_size(); - - return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, struct cpumask *mask) -{ - struct task_struct *p; - unsigned long flags; - int retval; - - get_online_cpus(); - rcu_read_lock(); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -out_unlock: - rcu_read_unlock(); - put_online_cpus(); - - return retval; -} - -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, mask, retlen)) - ret = -EFAULT; - else - ret = retlen; - } - 
free_cpumask_var(mask); - - return ret; -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - */ -SYSCALL_DEFINE0(sched_yield) -{ - struct rq *rq = this_rq_lock(); - - schedstat_inc(rq, yld_count); - current->sched_class->yield_task(rq); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); - - schedule(); - - return 0; -} - -static inline int should_resched(void) -{ - return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} - -static void __cond_resched(void) -{ - add_preempt_count(PREEMPT_ACTIVE); - __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); -} - -int __sched _cond_resched(void) -{ - if (should_resched()) { - __cond_resched(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(_cond_resched); - -/* - * __cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). 
- */ -int __cond_resched_lock(spinlock_t *lock) -{ - int resched = should_resched(); - int ret = 0; - - lockdep_assert_held(lock); - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched) - __cond_resched(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; -} -EXPORT_SYMBOL(__cond_resched_lock); - -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched()) { - local_bh_enable(); - __cond_resched(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - -/** - * yield - yield the current processor to other threads. - * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - sys_sched_yield(); -} -EXPORT_SYMBOL(yield); - -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Returns true if we indeed boosted the target task. - */ -bool __sched yield_to(struct task_struct *p, bool preempt) -{ - struct task_struct *curr = current; - struct rq *rq, *p_rq; - unsigned long flags; - bool yielded = 0; - - local_irq_save(flags); - rq = this_rq(); - -again: - p_rq = task_rq(p); - double_rq_lock(rq, p_rq); - while (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; - } - - if (!curr->sched_class->yield_to_task) - goto out; - - if (curr->sched_class != p->sched_class) - goto out; - - if (task_running(p_rq, p) || p->state) - goto out; - - yielded = curr->sched_class->yield_to_task(rq, p, preempt); - if (yielded) { - schedstat_inc(rq, yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. 
- */ - if (preempt && rq != p_rq) - resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. - */ - rq->skip_clock_update = 0; - } - -out: - double_rq_unlock(rq, p_rq); - local_irq_restore(flags); - - if (yielded) - schedule(); - - return yielded; -} -EXPORT_SYMBOL_GPL(yield_to); - -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. - */ -void __sched io_schedule(void) -{ - struct rq *rq = raw_rq(); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - schedule(); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - -long __sched io_schedule_timeout(long timeout) -{ - struct rq *rq = raw_rq(); - long ret; - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - ret = schedule_timeout(timeout); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); - return ret; -} - -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. 
- */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) -{ - struct task_struct *p; - unsigned int time_slice; - unsigned long flags; - struct rq *rq; - int retval; - struct timespec t; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - rq = task_rq_lock(p, &flags); - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); - - rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - -void sched_show_task(struct task_struct *p) -{ - unsigned long free = 0; - unsigned state; - - state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - free = stack_not_used(p); -#endif - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), - (unsigned long)task_thread_info(p)->flags); - - show_stack(p, NULL); -} - -void show_state_filter(unsigned long state_filter) -{ - struct task_struct *g, *p; - -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - rcu_read_lock(); - do_each_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take a lot of time: - */ - touch_nmi_watchdog(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); - } while_each_thread(g, p); - - touch_all_softlockup_watchdogs(); - -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); -#endif - rcu_read_unlock(); - /* - * Only show locks if all tasks are dumped: - */ - if (!state_filter) - debug_show_all_locks(); -} - -void __cpuinit init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. 
- */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - - __sched_fork(idle); - idle->state = TASK_RUNNING; - idle->se.exec_start = sched_clock(); - - do_set_cpus_allowed(idle, cpumask_of(cpu)); - /* - * We're having a chicken and egg problem, even though we are - * holding rq->lock, the cpu isn't yet set to this cpu so the - * lockdep check in task_group() will fail. - * - * Similar case to sched_fork(). / Alternatively we could - * use task_rq_lock() here and obtain the other rq->lock. - * - * Silence PROVE_RCU - */ - rcu_read_lock(); - __set_task_cpu(idle, cpu); - rcu_read_unlock(); - - rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) - idle->on_cpu = 1; -#endif - raw_spin_unlock_irqrestore(&rq->lock, flags); - - /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = 0; - - /* - * The idle tasks have their own, simple scheduling class: - */ - idle->sched_class = &idle_sched_class; - ftrace_graph_init_idle_task(idle, cpu); -#if defined(CONFIG_SMP) - sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif -} - -#ifdef CONFIG_SMP -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - if (p->sched_class && p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); -} - -/* - * This is how migration works: - * - * 1) we invoke migration_cpu_stop() on the target CPU using - * stop_one_cpu(). - * 2) stopper starts to run (implicitly forcing the migrated thread - * off the CPU) - * 3) it checks whether the migrated task is still in the wrong runqueue. - * 4) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 5) stopper completes and stop_one_cpu() returns and the migration - * is done. 
- */ - -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. - */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - unsigned long flags; - struct rq *rq; - unsigned int dest_cpu; - int ret = 0; - - rq = task_rq_lock(p, &flags); - - if (cpumask_equal(&p->cpus_allowed, new_mask)) - goto out; - - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } - - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - - do_set_cpus_allowed(p, new_mask); - - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; - - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - return 0; - } -out: - task_rq_unlock(rq, p, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); - -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. 
- */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) -{ - struct rq *rq_dest, *rq_src; - int ret = 0; - - if (unlikely(!cpu_active(dest_cpu))) - return ret; - - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); - - raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - goto fail; - - /* - * If we're not on a rq, the next wake-up will ensure we're - * placed properly. - */ - if (p->on_rq) { - dequeue_task(rq_src, p, 0); - set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } -done: - ret = 1; -fail: - double_rq_unlock(rq_src, rq_dest); - raw_spin_unlock(&p->pi_lock); - return ret; -} - -/* - * migration_cpu_stop - this will be executed by a highprio stopper thread - * and performs thread migration by bumping thread off CPU then - * 'pushing' onto another runqueue. - */ -static int migration_cpu_stop(void *data) -{ - struct migration_arg *arg = data; - - /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. - */ - local_irq_disable(); - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); - local_irq_enable(); - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. - */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. 
So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; -} - -/* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). - * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. - */ -static void migrate_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next, *stop = rq->stop; - int dest_cpu; - - /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. - * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. - */ - rq->stop = NULL; - - /* Ensure any throttled groups are reachable by pick_next_task */ - unthrottle_offline_cfs_rqs(rq); - - for ( ; ; ) { - /* - * There's this thread running, bail when that's the only - * remaining thread. - */ - if (rq->nr_running == 1) - break; - - next = pick_next_task(rq); - BUG_ON(!next); - next->sched_class->put_prev_task(rq, next); - - /* Find suitable destination for @next, with force if needed. 
*/ - dest_cpu = select_fallback_rq(dead_cpu, next); - raw_spin_unlock(&rq->lock); - - __migrate_task(next, dead_cpu, dest_cpu); - - raw_spin_lock(&rq->lock); - } - - rq->stop = stop; -} - -#endif /* CONFIG_HOTPLUG_CPU */ - -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. 
- */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(13); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[12] is terminator */ - - return table; -} - 
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif - -static void set_rq_online(struct rq *rq) -{ - if (!rq->online) { - const struct sched_class *class; - - cpumask_set_cpu(rq->cpu, rq->rd->online); - rq->online = 1; - - for_each_class(class) { - if (class->rq_online) - class->rq_online(rq); - } - } -} - -static void set_rq_offline(struct rq *rq) -{ - if (rq->online) { - const struct sched_class *class; - - for_each_class(class) { - if (class->rq_offline) - class->rq_offline(rq); - } - - cpumask_clear_cpu(rq->cpu, 
rq->rd->online); - rq->online = 0; - } -} - -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int __cpuinit -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - unsigned long flags; - struct rq *rq = cpu_rq(cpu); - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(cpu); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. 
- */ -static struct notifier_block __cpuinitdata migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP - -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_domain_debug_enabled; - -static int __init sched_domain_debug_setup(char *str) -{ - sched_domain_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_domain_debug_setup); - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - struct sched_group *group = sd->groups; - char str[256]; - - cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not 
load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %s level %s\n", str, sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - } - - printk(KERN_DEBUG "%*s groups:", level + 1, ""); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } - - if (!group->sgp->power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); - break; - } - - if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - break; - } - - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } - - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - - cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - - printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); - } - - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_domain_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if 
(sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ -# define sched_domain_debug(sd, cpu) do { } while (0) -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) - return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain *rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * 
If we dont want to free the old_rt yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_online; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; - return 0; - -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). 
- */ -struct root_domain def_root_domain; - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_groups(struct sched_group *sg, int free_sgp) -{ - struct sched_group *tmp, *first; - - if (!sg) - return; - - first = sg; - do { - tmp = sg->next; - - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); - - kfree(sg); - sg = tmp; - } while (sg != first); -} - -static void free_sched_domain(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - - /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. - */ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); - kfree(sd->groups); - } - kfree(sd); -} - -static void destroy_sched_domain(struct sched_domain *sd, int cpu) -{ - call_rcu(&sd->rcu, free_sched_domain); -} - -static void destroy_sched_domains(struct sched_domain *sd, int cpu) -{ - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); -} - -/* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). - * - * Also keep a unique ID per domain (we use the first cpu number in - * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). 
- */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); -DEFINE_PER_CPU(int, sd_llc_id); - -static void update_top_cache_domain(int cpu) -{ - struct sched_domain *sd; - int id = cpu; - - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) - id = cpumask_first(sched_domain_span(sd)); - - rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); - per_cpu(sd_llc_id, cpu) = id; -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - destroy_sched_domain(parent, cpu); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp, cpu); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); - - update_top_cache_domain(cpu); -} - -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - -/* Setup the mask of cpus configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); - return 1; -} - -__setup("isolcpus=", isolated_cpu_setup); - -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. 
Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = -1; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node != -1) - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. 
- */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ - nodemask_t used_nodes; - int i; - - cpumask_clear(span); - nodes_clear(used_nodes); - - cpumask_or(span, span, cpumask_of_node(node)); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - if (next_node < 0) - break; - cpumask_or(span, span, cpumask_of_node(next_node)); - } -} - -static const struct cpumask *cpu_node_mask(int cpu) -{ - lockdep_assert_held(&sched_domains_mutex); - - sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); - - return sched_domains_tmpmask; -} - -static const struct cpumask *cpu_allnodes_mask(int cpu) -{ - return cpu_possible_mask; -} -#endif /* CONFIG_NUMA */ - -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - struct sd_data data; -}; - -static int -build_overlap_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered = sched_domains_tmpmask; - struct sd_data *sdd = sd->private; - struct sched_domain *child; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { 
- struct cpumask *sg_span; - - if (cpumask_test_cpu(i, covered)) - continue; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - - if (!sg) - goto fail; - - sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else - cpumask_set_cpu(i, sg_span); - - cpumask_or(covered, covered, sg_span); - - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); - atomic_inc(&sg->sgp->ref); - - if (cpumask_test_cpu(cpu, sg_span)) - groups = sg; - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - last->next = first; - } - sd->groups = groups; - - return 0; - -fail: - free_sched_groups(first, 0); - - return -ENOMEM; -} - -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) -{ - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - struct sched_domain *child = sd->child; - - if (child) - cpu = cpumask_first(sched_domain_span(child)); - - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ - } - - return cpu; -} - -/* - * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. 
- * - * Assumes the sched_domain tree is fully constructed - */ -static int -build_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL; - struct sd_data *sdd = sd->private; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered; - int i; - - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(sched_domain_span(sd))) - return 0; - - lockdep_assert_held(&sched_domains_mutex); - covered = sched_domains_tmpmask; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = get_group(i, sdd, &sg); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; - - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; - - return 0; -} - -/* - * Initialize sched groups cpu_power. - * - * cpu_power indicates the capacity of sched group, which is used while - * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. 
- */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) -{ - struct sched_group *sg = sd->groups; - - WARN_ON(!sd || !sg); - - do { - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); - sg = sg->next; - } while (sg != sd->groups); - - if (cpu != group_first_cpu(sg)) - return; - - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - 
/* turn off idle balance on this domain */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* turn on idle balance on this domain */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); /* fall through */ - case sa_sd: - free_percpu(d->sd); /* fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); /* fall through */ - case sa_none: - break; - } -} - -static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, - const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain and - * sched_group structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. - */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; -} - -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, - { sd_init_ALLNODES, cpu_allnodes_mask, }, -#endif - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -static int __sdt_alloc(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for (tl = sched_domain_topology; tl->init; tl++) { - struct sd_data *sdd = &tl->data; - - sdd->sd = alloc_percpu(struct sched_domain *); - if (!sdd->sd) - return -ENOMEM; - - sdd->sg = alloc_percpu(struct sched_group *); - if (!sdd->sg) - return -ENOMEM; - - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) - return -ENOMEM; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - struct sched_group *sg; - struct sched_group_power *sgp; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return -ENOMEM; - - *per_cpu_ptr(sdd->sd, j) = sd; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sg) - return -ENOMEM; - - *per_cpu_ptr(sdd->sg, j) = sg; - - sgp = kzalloc_node(sizeof(struct sched_group_power), - GFP_KERNEL, cpu_to_node(j)); - if (!sgp) - return -ENOMEM; - - *per_cpu_ptr(sdd->sgp, j) = sgp; - } - } - - return 0; -} - -static void __sdt_free(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for (tl = sched_domain_topology; tl->init; tl++) { - struct sd_data *sdd = &tl->data; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - 
kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); - } - free_percpu(sdd->sd); - free_percpu(sdd->sg); - free_percpu(sdd->sgp); - } -} - -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *child, - int cpu) -{ - struct sched_domain *sd = tl->init(tl, cpu); - if (!sd) - return child; - - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) { - sd->level = child->level + 1; - sched_domain_level_max = max(sched_domain_level_max, sd->level); - child->parent = sd; - } - sd->child = child; - - return sd; -} - -/* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus - */ -static int build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) -{ - enum s_alloc alloc_state = sa_none; - struct sched_domain *sd; - struct s_data d; - int i, ret = -ENOMEM; - - alloc_state = __visit_domain_allocation_hell(&d, cpu_map); - if (alloc_state != sa_rootdomain) - goto error; - - /* Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu(i, cpu_map) { - struct sched_domain_topology_level *tl; - - sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) { - sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) - sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; - } - - while (sd->child) - sd = sd->child; - - *per_cpu_ptr(d.sd, i) = sd; - } - - /* Build the groups for the domains */ - for_each_cpu(i, cpu_map) { - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { - if (build_overlap_sched_groups(sd, i)) - goto error; - } else { - if (build_sched_groups(sd, i)) - goto error; - } - } - } - - /* Calculate CPU power for physical packages and nodes */ - for (i = nr_cpumask_bits-1; i >= 0; i--) { - if (!cpumask_test_cpu(i, cpu_map)) - continue; - - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); - init_sched_groups_power(i, sd); - } - } - - /* Attach the domains */ - rcu_read_lock(); - for_each_cpu(i, cpu_map) { - sd = *per_cpu_ptr(d.sd, i); - cpu_attach_domain(sd, d.rd, i); - } - rcu_read_unlock(); - - ret = 0; -error: - __free_domain_allocs(&d, alloc_state, cpu_map); - return ret; -} - -static cpumask_var_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ - -/* - * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. 
- */ -int __attribute__((weak)) arch_update_cpu_topology(void) -{ - return 0; -} - -cpumask_var_t *alloc_sched_domains(unsigned int ndoms) -{ - int i; - cpumask_var_t *doms; - - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); - if (!doms) - return NULL; - for (i = 0; i < ndoms; i++) { - if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { - free_sched_domains(doms, i); - return NULL; - } - } - return doms; -} - -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) -{ - unsigned int i; - for (i = 0; i < ndoms; i++) - free_cpumask_var(doms[i]); - kfree(doms); -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ -static int init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = alloc_sched_domains(ndoms_cur); - if (!doms_cur) - doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; - err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); - - return err; -} - -/* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - int i; - - rcu_read_lock(); - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - rcu_read_unlock(); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* fast path */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? 
(new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be allocated using - * alloc_sched_domains. This routine takes ownership of it and will - * free_sched_domains it when done with it. If the caller failed the - * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* always unregister in case we don't destroy any domains */ - unregister_sched_domain_sysctl(); - - /* Let architecture update cpu core mappings. */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? 
ndoms_new : 0; - - /* Destroy deleted domains */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur[i]); -match1: - ; - } - - if (doms_new == NULL) { - ndoms_cur = 0; - doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* no match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) - free_sched_domains(doms_cur, ndoms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void reinit_sched_domains(void) -{ - get_online_cpus(); - - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - unsigned int level = 0; - - if (sscanf(buf, "%u", &level) != 1) - return -EINVAL; - - /* - * level is always be positive so don't check for - * level < POWERSAVINGS_BALANCE_NONE which is 0 - * What happens on 0 or 1 byte write, - * need to check for count as well? 
- */ - - if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) - return -EINVAL; - - if (smt) - sched_smt_power_savings = level; - else - sched_mc_power_savings = level; - - reinit_sched_domains(); - - return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static DEVICE_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static DEVICE_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct device *dev) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = device_create_file(dev, &dev_attr_sched_smt_power_savings); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = device_create_file(dev, &dev_attr_sched_mc_power_savings); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - -/* - * Update cpusets according to cpu_active mask. If cpusets are - * disabled, cpuset_update_active_cpus() becomes a simple wrapper - * around partition_sched_domains(). 
- */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -void __init sched_init_smp(void) -{ - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_domains_mutex); - put_online_cpus(); - - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - - init_hrtick(); - - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) - BUG(); - sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); - - init_sched_rt_class(); -} -#else -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - -const_debug unsigned int sysctl_timer_migration = 1; - -int in_sched_functions(unsigned long addr) -{ - return in_lock_functions(addr) || - (addr >= (unsigned long)__sched_text_start - && addr < (unsigned long)__sched_text_end); -} - -#ifdef CONFIG_CGROUP_SCHED -struct task_group root_task_group; -#endif - -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); - -void __init sched_init(void) 
-{ - int i, j; - unsigned long alloc_size = 0, ptr; - -#ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size += num_possible_cpus() * cpumask_size(); -#endif - if (alloc_size) { - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); - -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; - ptr += cpumask_size(); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ - } - -#ifdef CONFIG_SMP - init_defrootdomain(); -#endif - - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - -#ifdef CONFIG_RT_GROUP_SCHED - init_rt_bandwidth(&root_task_group.rt_bandwidth, - global_rt_period(), global_rt_runtime()); -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_CGROUP_SCHED - list_add(&root_task_group.list, &task_groups); - INIT_LIST_HEAD(&root_task_group.children); - INIT_LIST_HEAD(&root_task_group.siblings); - autogroup_init(&init_task); - -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif - for_each_possible_cpu(i) { - struct rq *rq; - - rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); - rq->nr_running = 0; - rq->calc_load_active = 0; - 
rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs); - init_rt_rq(&rq->rt, rq); -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - /* - * How much cpu bandwidth does root_task_group get? - * - * In case of task-groups formed thr' the cgroup filesystem, it - * gets 100% of the cpu resources in the system. This overall - * system cpu resource is divided among the tasks of - * root_task_group and its child task-groups in a fair manner, - * based on each entity's (task or task-group's) weight - * (se->load.weight). - * - * In other words, if root_task_group has 10 tasks of weight - * 1024) and two child groups A0 and A1 (of weight 1024 each), - * then A0's share of the cpu resource is: - * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% - * - * We achieve this by letting root_task_group's tasks sit - * directly in rq->cfs (i.e root_task_group->se[] = NULL). - */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); -#endif /* CONFIG_FAIR_GROUP_SCHED */ - - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; -#ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); - init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); -#endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; - - rq->last_load_update_tick = jiffies; - -#ifdef CONFIG_SMP - rq->sd = NULL; - rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; - rq->post_schedule = 0; - rq->active_balance = 0; - rq->next_balance = jiffies; - rq->push_cpu = 0; - rq->cpu = i; - rq->online = 0; - rq->idle_stamp = 0; - rq->avg_idle = 2*sysctl_sched_migration_cost; - rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ - rq->nohz_flags = 0; -#endif -#endif - init_rq_hrtick(rq); - atomic_set(&rq->nr_iowait, 0); - } - - set_load_weight(&init_task); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); 
-#endif - -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters); -#endif - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current); - - /* - * Make us the idle thread. Technically, schedule() should not be - * called from this thread, however somewhere below it might be, - * but because we are the idle thread, we just pick up running again - * when this runqueue becomes "idle". - */ - init_idle(current, smp_processor_id()); - - calc_load_update = jiffies + LOAD_FREQ; - - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - -#ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); -#endif - init_sched_fair_class(); - - scheduler_running = 1; -} - -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); - - return (nested == preempt_offset); -} - -void __might_sleep(const char *file, int line, int preempt_offset) -{ - static unsigned long prev_jiffy; /* ratelimiting */ - - rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) - return; - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); - - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); -} -EXPORT_SYMBOL(__might_sleep); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) -{ - const struct sched_class *prev_class = p->sched_class; - int old_prio = p->prio; - int on_rq; - - on_rq = p->on_rq; - if (on_rq) - dequeue_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - enqueue_task(rq, p, 0); - resched_task(rq->curr); - } - - check_class_changed(rq, p, prev_class, old_prio); -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; - - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { - /* - * Only normalize user tasks: - */ - if (!p->mm) - continue; - - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.statistics.wait_start = 0; - p->se.statistics.sleep_start = 0; - p->se.statistics.block_start = 0; -#endif - - if (!rt_task(p)) { - /* - * Renice negative nice level userspace - * tasks back to 0: - */ - if (TASK_NICE(p) < 0 && p->mm) - set_user_nice(p, 0); - continue; - } - - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - - normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); -} - -#endif /* CONFIG_MAGIC_SYSRQ */ - -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -/* - * These functions are only useful for the IA64 
MCA handling, or kdb. - * - * They can only be called when the whole system has been - * stopped - every CPU needs to be quiescent, and no scheduling - * activity can take place. Using them for anything else would - * be a serious bug, and as a result, they aren't even visible - * under any other configuration. - */ - -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -struct task_struct *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
- */ -void set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif - -#ifdef CONFIG_CGROUP_SCHED -/* task_group_lock serializes the addition/removal of task groups */ -static DEFINE_SPINLOCK(task_group_lock); - -static void free_sched_group(struct task_group *tg) -{ - free_fair_sched_group(tg); - free_rt_sched_group(tg); - autogroup_free(tg); - kfree(tg); -} - -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(struct task_group *parent) -{ - struct task_group *tg; - unsigned long flags; - - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - - if (!alloc_fair_sched_group(tg, parent)) - goto err; - - if (!alloc_rt_sched_group(tg, parent)) - goto err; - - spin_lock_irqsave(&task_group_lock, flags); - list_add_rcu(&tg->list, &task_groups); - - WARN_ON(!parent); /* root should already exist */ - - tg->parent = parent; - INIT_LIST_HEAD(&tg->children); - list_add_rcu(&tg->siblings, &parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); -} - -/* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) -{ - /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); -} - -/* Destroy runqueue etc associated with a task group */ -void sched_destroy_group(struct task_group *tg) -{ - unsigned long flags; - int i; - - /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - - spin_lock_irqsave(&task_group_lock, flags); - list_del_rcu(&tg->list); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); -} - -/* change task's runqueue when it moves between groups. 
- * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) -{ - int on_rq, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - running = task_current(rq, tsk); - on_rq = tsk->on_rq; - - if (on_rq) - dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); - else -#endif - set_task_rq(tsk, task_cpu(tsk)); - - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, tsk, 0); - - task_rq_unlock(rq, tsk, &flags); -} -#endif /* CONFIG_CGROUP_SCHED */ - -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. 
- */ -static DEFINE_MUTEX(rt_constraints_mutex); - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - - do_each_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) - return 1; - } while_each_thread(g, p); - - return 0; -} - -struct rt_schedulable_data { - struct task_group *tg; - u64 rt_period; - u64 rt_runtime; -}; - -static int tg_rt_schedulable(struct task_group *tg, void *data) -{ - struct rt_schedulable_data *d = data; - struct task_group *child; - unsigned long total, sum = 0; - u64 period, runtime; - - period = ktime_to_ns(tg->rt_bandwidth.rt_period); - runtime = tg->rt_bandwidth.rt_runtime; - - if (tg == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - /* - * Cannot have more runtime than the period. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - /* - * Ensure we don't starve existing RT tasks. - */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) - return -EBUSY; - - total = to_ratio(period, runtime); - - /* - * Nobody can have more than the global setting allows. - */ - if (total > to_ratio(global_rt_period(), global_rt_runtime())) - return -EINVAL; - - /* - * The sum of our children's runtime should not exceed our own. 
- */ - list_for_each_entry_rcu(child, &tg->children, siblings) { - period = ktime_to_ns(child->rt_bandwidth.rt_period); - runtime = child->rt_bandwidth.rt_runtime; - - if (child == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - sum += to_ratio(period, runtime); - } - - if (sum > total) - return -EINVAL; - - return 0; -} - -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - int ret; - - struct rt_schedulable_data data = { - .tg = tg, - .rt_period = period, - .rt_runtime = runtime, - }; - - rcu_read_lock(); - ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); - rcu_read_unlock(); - - return ret; -} - -static int tg_set_rt_bandwidth(struct task_group *tg, - u64 rt_period, u64 rt_runtime) -{ - int i, err = 0; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - err = __rt_schedulable(tg, rt_period, rt_runtime); - if (err) - goto unlock; - - raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); - tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); - tg->rt_bandwidth.rt_runtime = rt_runtime; - - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = tg->rt_rq[i]; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_runtime; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); -unlock: - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return err; -} - -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) -{ - u64 rt_runtime, rt_period; - - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us < 0) - rt_runtime = RUNTIME_INF; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -long sched_group_rt_runtime(struct task_group *tg) -{ - u64 rt_runtime_us; - - if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) - return -1; - - rt_runtime_us = tg->rt_bandwidth.rt_runtime; - do_div(rt_runtime_us, NSEC_PER_USEC); - return 
rt_runtime_us; -} - -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) -{ - u64 rt_runtime, rt_period; - - rt_period = (u64)rt_period_us * NSEC_PER_USEC; - rt_runtime = tg->rt_bandwidth.rt_runtime; - - if (rt_period == 0) - return -EINVAL; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -long sched_group_rt_period(struct task_group *tg) -{ - u64 rt_period_us; - - rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); - do_div(rt_period_us, NSEC_PER_USEC); - return rt_period_us; -} - -static int sched_rt_global_constraints(void) -{ - u64 runtime, period; - int ret = 0; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} - -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ -static int sched_rt_global_constraints(void) -{ - unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. 
- */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - - return 0; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } - } - mutex_unlock(&mutex); - - return ret; -} - -#ifdef CONFIG_CGROUP_SCHED - -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); -} - -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct task_group *tg, *parent; - - if (!cgrp->parent) { - /* This is early initialization for the top cgroup */ - return &root_task_group.css; - } - - parent = cgroup_tg(cgrp->parent); - tg = sched_create_group(parent); - if (IS_ERR(tg)) - return ERR_PTR(-ENOMEM); - - return &tg->css; -} - -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct task_group *tg = cgroup_tg(cgrp); - - sched_destroy_group(tg); -} - -static int 
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) -{ - struct task_struct *task; - - cgroup_taskset_for_each(task, cgrp, tset) { -#ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) - return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; -#endif - } - return 0; -} - -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) -{ - struct task_struct *task; - - cgroup_taskset_for_each(task, cgrp, tset) - sched_move_task(task); -} - -static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - sched_move_task(task); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) -{ - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); -} - -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) -{ - struct task_group *tg = cgroup_tg(cgrp); - - return (u64) scale_load_down(tg->shares); -} - -#ifdef CONFIG_CFS_BANDWIDTH -static DEFINE_MUTEX(cfs_constraints_mutex); - -const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ - -static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); - -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) -{ - int i, ret = 0, runtime_enabled, runtime_was_enabled; - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - - if (tg == &root_task_group) - return -EINVAL; - - /* - * Ensure we have at some 
amount of bandwidth every period. This is - * to prevent reaching a state of large arrears when throttled via - * entity_tick() resulting in prolonged exit starvation. - */ - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) - return -EINVAL; - - /* - * Likewise, bound things on the otherside by preventing insane quota - * periods. This also allows us to normalize in computing quota - * feasibility. - */ - if (period > max_cfs_quota_period) - return -EINVAL; - - mutex_lock(&cfs_constraints_mutex); - ret = __cfs_schedulable(tg, period, quota); - if (ret) - goto out_unlock; - - runtime_enabled = quota != RUNTIME_INF; - runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - - __refill_cfs_bandwidth_runtime(cfs_b); - /* restart the period timer (if active) to handle new period expiry */ - if (runtime_enabled && cfs_b->timer_active) { - /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); - } - raw_spin_unlock_irq(&cfs_b->lock); - - for_each_possible_cpu(i) { - struct cfs_rq *cfs_rq = tg->cfs_rq[i]; - struct rq *rq = cfs_rq->rq; - - raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - - if (cfs_rq->throttled) - unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); - } -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - - return ret; -} - -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) -{ - u64 quota, period; - - period = ktime_to_ns(tg->cfs_bandwidth.period); - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - - return tg_set_cfs_bandwidth(tg, period, quota); -} - -long tg_get_cfs_quota(struct task_group *tg) -{ - u64 quota_us; - - if (tg->cfs_bandwidth.quota == RUNTIME_INF) - return -1; - - quota_us = tg->cfs_bandwidth.quota; - 
do_div(quota_us, NSEC_PER_USEC); - - return quota_us; -} - -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota); -} - -long tg_get_cfs_period(struct task_group *tg) -{ - u64 cfs_period_us; - - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); - - return cfs_period_us; -} - -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) -{ - return tg_get_cfs_quota(cgroup_tg(cgrp)); -} - -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) -{ - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) -{ - return tg_get_cfs_period(cgroup_tg(cgrp)); -} - -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) -{ - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); -} - -struct cfs_schedulable_data { - struct task_group *tg; - u64 period, quota; -}; - -/* - * normalize group quota/period to be quota/max_period - * note: units are usecs - */ -static u64 normalize_cfs_quota(struct task_group *tg, - struct cfs_schedulable_data *d) -{ - u64 quota, period; - - if (tg == d->tg) { - period = d->period; - quota = d->quota; - } else { - period = tg_get_cfs_period(tg); - quota = tg_get_cfs_quota(tg); - } - - /* note: these should typically be equivalent */ - if (quota == RUNTIME_INF || quota == -1) - return RUNTIME_INF; - - return to_ratio(period, quota); -} - -static int tg_cfs_schedulable_down(struct task_group *tg, void *data) -{ - struct cfs_schedulable_data *d = data; - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - s64 quota = 0, parent_quota = -1; - - if (!tg->parent) { - quota = RUNTIME_INF; - } else { - struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 
- - quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; - - /* - * ensure max(child_quota) <= parent_quota, inherit when no - * limit is set - */ - if (quota == RUNTIME_INF) - quota = parent_quota; - else if (parent_quota != RUNTIME_INF && quota > parent_quota) - return -EINVAL; - } - cfs_b->hierarchal_quota = quota; - - return 0; -} - -static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) -{ - int ret; - struct cfs_schedulable_data data = { - .tg = tg, - .period = period, - .quota = quota, - }; - - if (quota != RUNTIME_INF) { - do_div(data.period, NSEC_PER_USEC); - do_div(data.quota, NSEC_PER_USEC); - } - - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; -} - -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct task_group *tg = cgroup_tg(cgrp); - struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); - - return 0; -} -#endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) -{ - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); -} - -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) -{ - return sched_group_rt_runtime(cgroup_tg(cgrp)); -} - -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) -{ - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); -} - -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) -{ - return sched_group_rt_period(cgroup_tg(cgrp)); -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", - .read_u64 
= cpu_shares_read_u64, - .write_u64 = cpu_shares_write_u64, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, - }, - { - .name = "cfs_period_us", - .read_u64 = cpu_cfs_period_read_u64, - .write_u64 = cpu_cfs_period_write_u64, - }, - { - .name = "stat", - .read_map = cpu_stats_show, - }, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif -}; - -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} - -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, - .exit = cpu_cgroup_exit, - .populate = cpu_cgroup_populate, - .subsys_id = cpu_cgroup_subsys_id, - .early_init = 1, -}; - -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). 
- */ - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
- */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - 
return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, -}; - -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); -} - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, - .populate = cpuacct_populate, - .subsys_id = cpuacct_subsys_id, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ -/* - * kernel/sched/cpupri.c - * - * CPU priority management - * - * Copyright (C) 2007-2008 Novell - * - * Author: Gregory Haskins - * - * This code tracks the priority of each CPU so that global migration - * decisions are easy to calculate. Each CPU can be in a state as follows: - * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 - * - * going from the lowest priority to the highest. CPUs in the INVALID state - * are not eligible for routing. The system maintains this state with - * a 2 dimensional bitmap (the first for priority class, the second for cpus - * in that class). Therefore a typical application without affinity - * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit - * searches). 
For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that - * yields the worst case search is fairly contrived. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -#include -#include "cpupri.h" - -/* Convert between a 140 based task->prio, and our 102 based cpupri */ -static int convert_prio(int prio) -{ - int cpupri; - - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio + 1; - - return cpupri; -} - -/** - * cpupri_find - find the best (lowest-pri) CPU in the system - * @cp: The cpupri context - * @p: The task - * @lowest_mask: A mask to fill in with selected CPUs (or NULL) - * - * Note: This function returns the recommended CPUs as calculated during the - * current invocation. By the time the call returns, the CPUs may have in - * fact changed priorities any number of times. While not ideal, it is not - * an issue of correctness since the normal rebalancer logic will correct - * any discrepancies created by racing against the uncertainty of the current - * priority configuration. - * - * Returns: (int)bool - CPUs were found - */ -int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) -{ - int idx = 0; - int task_pri = convert_prio(p->prio); - - if (task_pri >= MAX_RT_PRIO) - return 0; - - for (idx = 0; idx < task_pri; idx++) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; - int skip = 0; - - if (!atomic_read(&(vec)->count)) - skip = 1; - /* - * When looking at the vector, we need to read the counter, - * do a memory barrier, then read the mask. - * - * Note: This is still all racey, but we can deal with it. 
- * Ideally, we only want to look at masks that are set. - * - * If a mask is not set, then the only thing wrong is that we - * did a little more work than necessary. - * - * If we read a zero count but the mask is set, because of the - * memory barriers, that can only happen when the highest prio - * task for a run queue has left the run queue, in which case, - * it will be followed by a pull. If the task we are processing - * fails to find a proper place to go, that pull request will - * pull this task if the run queue is running at a lower - * priority. - */ - smp_rmb(); - - /* Need to do the rmb for every iteration */ - if (skip) - continue; - - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) - continue; - - if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); - - /* - * We have to ensure that we have at least one bit - * still set in the array, since the map could have - * been concurrently emptied between the first and - * second reads of vec->mask. If we hit this - * condition, simply act as though we never hit this - * priority level and continue on. - */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) - continue; - } - - return 1; - } - - return 0; -} - -/** - * cpupri_set - update the cpu priority setting - * @cp: The cpupri context - * @cpu: The target cpu - * @newpri: The priority (INVALID-RT99) to assign to this CPU - * - * Note: Assumes cpu_rq(cpu)->lock is locked - * - * Returns: (void) - */ -void cpupri_set(struct cpupri *cp, int cpu, int newpri) -{ - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - int do_mb = 0; - - newpri = convert_prio(newpri); - - BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); - - if (newpri == oldpri) - return; - - /* - * If the cpu was currently mapped to a different value, we - * need to map it to the new value then remove the old value. - * Note, we must add the new value first, otherwise we risk the - * cpu being missed by the priority loop in cpupri_find. 
- */ - if (likely(newpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - - cpumask_set_cpu(cpu, vec->mask); - /* - * When adding a new vector, we update the mask first, - * do a write memory barrier, and then update the count, to - * make sure the vector is visible when count is set. - */ - smp_mb__before_atomic_inc(); - atomic_inc(&(vec)->count); - do_mb = 1; - } - if (likely(oldpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - - /* - * Because the order of modification of the vec->count - * is important, we must make sure that the update - * of the new prio is seen before we decrement the - * old prio. This makes sure that the loop sees - * one or the other when we raise the priority of - * the run queue. We don't care about when we lower the - * priority, as that will trigger an rt pull anyway. - * - * We only need to do a memory barrier if we updated - * the new priority vec. - */ - if (do_mb) - smp_mb__after_atomic_inc(); - - /* - * When removing from the vector, we decrement the counter first - * do a memory barrier and then clear the mask. - */ - atomic_dec(&(vec)->count); - smp_mb__after_atomic_inc(); - cpumask_clear_cpu(cpu, vec->mask); - } - - *currpri = newpri; -} - -/** - * cpupri_init - initialize the cpupri structure - * @cp: The cpupri context - * - * Returns: -ENOMEM if memory fails. 
- */ -int cpupri_init(struct cpupri *cp) -{ - int i; - - memset(cp, 0, sizeof(*cp)); - - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { - struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - - atomic_set(&vec->count, 0); - if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) - goto cleanup; - } - - for_each_possible_cpu(i) - cp->cpu_to_pri[i] = CPUPRI_INVALID; - return 0; - -cleanup: - for (i--; i >= 0; i--) - free_cpumask_var(cp->pri_to_cpu[i].mask); - return -ENOMEM; -} - -/** - * cpupri_cleanup - clean up the cpupri structure - * @cp: The cpupri context - */ -void cpupri_cleanup(struct cpupri *cp) -{ - int i; - - for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) - free_cpumask_var(cp->pri_to_cpu[i].mask); -} -/* - * kernel/sched/debug.c - * - * Print the CFS rbtree - * - * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include - -#include "sched.h" - -static DEFINE_SPINLOCK(sched_debug_lock); - -/* - * This allows printing both to /proc/sched_debug and - * to the console - */ -#define SEQ_printf(m, x...) 
\ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) - -/* - * Ease the printing of nsec fields: - */ -static long long nsec_high(unsigned long long nsec) -{ - if ((long long)nsec < 0) { - nsec = -nsec; - do_div(nsec, 1000000); - return -nsec; - } - do_div(nsec, 1000000); - - return nsec; -} - -static unsigned long nsec_low(unsigned long long nsec) -{ - if ((long long)nsec < 0) - nsec = -nsec; - - return do_div(nsec, 1000000); -} - -#define SPLIT_NS(x) nsec_high(x), nsec_low(x) - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) -{ - struct sched_entity *se = tg->se[cpu]; - if (!se) - return; - -#define P(F) \ - SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define PN(F) \ - SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - - PN(se->exec_start); - PN(se->vruntime); - PN(se->sum_exec_runtime); -#ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); -#endif - P(se->load.weight); -#undef PN -#undef P -} -#endif - -#ifdef CONFIG_CGROUP_SCHED -static char group_path[PATH_MAX]; - -static char *task_group_path(struct task_group *tg) -{ - if (autogroup_path(tg, group_path, PATH_MAX)) - return group_path; - - /* - * May be NULL if the underlying cgroup isn't fully-created yet - */ - if (!tg->css.cgroup) { - group_path[0] = '\0'; - return group_path; - } - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; -} -#endif - -static void -print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) -{ - if (rq->curr == p) - SEQ_printf(m, "R"); - else - SEQ_printf(m, " "); - - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, - 
SPLIT_NS(p->se.vruntime), - (long long)(p->nvcsw + p->nivcsw), - p->prio); -#ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.vruntime), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); -#else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", - 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); -#endif -#ifdef CONFIG_CGROUP_SCHED - SEQ_printf(m, " %s", task_group_path(task_group(p))); -#endif - - SEQ_printf(m, "\n"); -} - -static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) -{ - struct task_struct *g, *p; - unsigned long flags; - - SEQ_printf(m, - "\nrunnable tasks:\n" - " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" - "------------------------------------------------------" - "----------------------------------------------------\n"); - - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { - if (!p->on_rq || task_cpu(p) != rq_cpu) - continue; - - print_task(m, rq, p); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; - struct rq *rq = cpu_rq(cpu); - struct sched_entity *last; - unsigned long flags; - -#ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); -#else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); - - raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; - last = __pick_last_entity(cfs_rq); - if (last) - max_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", 
"MIN_vruntime", - SPLIT_NS(MIN_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", - cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); -#endif - - print_cfs_group_stats(m, cpu, cfs_rq->tg); -#endif -} - -void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) -{ -#ifdef CONFIG_RT_GROUP_SCHED - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); -#else - SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); -#endif - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) -#define PN(x) \ - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) - - P(rt_nr_running); - P(rt_throttled); - PN(rt_time); - PN(rt_runtime); - -#undef PN -#undef P -} - -extern __read_mostly int sched_clock_running; - -static void print_cpu(struct seq_file *m, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - -#ifdef CONFIG_X86 - { - unsigned int freq = cpu_khz ? 
: 1; - - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", - cpu, freq / 1000, (freq % 1000)); - } -#else - SEQ_printf(m, "\ncpu#%d\n", cpu); -#endif - -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) -#define PN(x) \ - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) - - P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->load.weight); - P(nr_switches); - P(nr_load_updates); - P(nr_uninterruptible); - PN(next_balance); - P(curr->pid); - PN(clock); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); -#undef P -#undef PN - -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - - P(yld_count); - - P(sched_switch); - P(sched_count); - P(sched_goidle); -#ifdef CONFIG_SMP - P64(avg_idle); -#endif - - P(ttwu_count); - P(ttwu_local); - -#undef P -#undef P64 -#endif - spin_lock_irqsave(&sched_debug_lock, flags); - print_cfs_stats(m, cpu); - print_rt_stats(m, cpu); - - rcu_read_lock(); - print_rq(m, rq, cpu); - rcu_read_unlock(); - spin_unlock_irqrestore(&sched_debug_lock, flags); -} - -static const char *sched_tunable_scaling_names[] = { - "none", - "logaritmic", - "linear" -}; - -static int sched_debug_show(struct seq_file *m, void *v) -{ - u64 ktime, sched_clk, cpu_clk; - unsigned long flags; - int cpu; - - local_irq_save(flags); - ktime = ktime_to_ns(ktime_get()); - sched_clk = sched_clock(); - cpu_clk = local_clock(); - local_irq_restore(flags); - - SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - -#define P(x) \ - SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) -#define PN(x) \ - SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(ktime); - PN(sched_clk); - PN(cpu_clk); - P(jiffies); -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - P(sched_clock_stable); -#endif -#undef PN -#undef P - - 
SEQ_printf(m, "\n"); - SEQ_printf(m, "sysctl_sched\n"); - -#define P(x) \ - SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) -#define PN(x) \ - SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); - PN(sysctl_sched_wakeup_granularity); - P(sysctl_sched_child_runs_first); - P(sysctl_sched_features); -#undef PN -#undef P - - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", - sysctl_sched_tunable_scaling, - sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); - - for_each_online_cpu(cpu) - print_cpu(m, cpu); - - SEQ_printf(m, "\n"); - - return 0; -} - -void sysrq_sched_debug_show(void) -{ - sched_debug_show(NULL, NULL); -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_debug_show, NULL); -} - -static const struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init init_sched_debug_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); - if (!pe) - return -ENOMEM; - return 0; -} - -__initcall(init_sched_debug_procfs); - -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ - unsigned long nr_switches; - - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, - get_nr_threads(p)); - SEQ_printf(m, - "---------------------------------------------------------\n"); -#define __P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) -#define P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) -#define __PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - - PN(se.exec_start); - PN(se.vruntime); - PN(se.sum_exec_runtime); - - nr_switches = p->nvcsw + p->nivcsw; - -#ifdef CONFIG_SCHEDSTATS - 
PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); - P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - - { - u64 avg_atom, avg_per_cpu; - - avg_atom = p->se.sum_exec_runtime; - if (nr_switches) - do_div(avg_atom, nr_switches); - else - avg_atom = -1LL; - - avg_per_cpu = p->se.sum_exec_runtime; - if (p->se.nr_migrations) { - avg_per_cpu = div64_u64(avg_per_cpu, - p->se.nr_migrations); - } else { - avg_per_cpu = -1LL; - } - - __PN(avg_atom); - __PN(avg_per_cpu); - } -#endif - __P(nr_switches); - SEQ_printf(m, "%-35s:%21Ld\n", - "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-35s:%21Ld\n", - "nr_involuntary_switches", (long long)p->nivcsw); - - P(se.load.weight); - P(policy); - P(prio); -#undef PN -#undef __PN -#undef P -#undef __P - - { - unsigned int this_cpu = raw_smp_processor_id(); - u64 t0, t1; - - t0 = cpu_clock(this_cpu); - t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-35s:%21Ld\n", - "clock-delta", (long long)(t1-t0)); - } -} - -void proc_sched_set_task(struct task_struct *p) -{ -#ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); -#endif -} -/* - * Completely Fair 
Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) - * - * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar - * - * Interactivity improvements by Mike Galbraith - * (C) 2007 Mike Galbraith - * - * Various enhancements by Dmitry Adamushko. - * (C) 2007 Dmitry Adamushko - * - * Group scheduling enhancements by Srivatsa Vaddagiri - * Copyright IBM Corporation, 2007 - * Author: Srivatsa Vaddagiri - * - * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner - * - * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "sched.h" - -/* - * Targeted preemption latency for CPU-bound tasks: - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. 
- * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - */ -unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; - -/* - * The initial- and re-scaling of tunables is configurable - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) - * - * Options are: - * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus - */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling - = SCHED_TUNABLESCALING_LOG; - -/* - * Minimal preemption granularity for CPU-bound tasks: - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - -/* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 8; - -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - -/* - * SCHED_OTHER wake-up granularity. - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; - -/* - * The exponential sliding window over which load is averaged for shares - * distribution. 
- * (default: 10msec) - */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; - -#ifdef CONFIG_CFS_BANDWIDTH -/* - * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool - * each time a cfs_rq requests quota. - * - * Note: in the case that the slice exceeds the runtime remaining (either due - * to consumption or the quota being specified to be smaller than the slice) - * we will always only issue the remaining available time. - * - * default: 5 msec, units: microseconds - */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; -#endif - -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static int get_update_sysctl_factor(void) -{ - unsigned int cpus = min_t(int, num_online_cpus(), 8); - unsigned int factor; - - switch (sysctl_sched_tunable_scaling) { - case SCHED_TUNABLESCALING_NONE: - factor = 1; - break; - case SCHED_TUNABLESCALING_LINEAR: - factor = cpus; - break; - case SCHED_TUNABLESCALING_LOG: - default: - factor = 1 + ilog2(cpus); - break; - } - - return factor; -} - -static void update_sysctl(void) -{ - unsigned int factor = get_update_sysctl_factor(); - -#define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); -#undef SET_SYSCTL -} - -void sched_init_granularity(void) -{ - update_sysctl(); -} - -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long 
delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; - - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; - } - - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - - -const struct sched_class fair_sched_class; - -/************************************************************** - * CFS operations on generic schedulable entities: - */ - -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - -static inline struct task_struct *task_of(struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(!entity_is_task(se)); -#endif - return container_of(se, struct task_struct, se); -} - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ 
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ - if (!cfs_rq->on_list) { - /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. - */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } - - cfs_rq->on_list = 1; - } -} - -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->on_list) { - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - cfs_rq->on_list = 0; - } -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? */ -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return 1; - - return 0; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -static void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ - int se_depth, pse_depth; - - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. 
- */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); - - while (se_depth > pse_depth) { - se_depth--; - *se = parent_entity(*se); - } - - while (pse_depth > se_depth) { - pse_depth--; - *pse = parent_entity(*pse); - } - - while (!is_same_group(*se, *pse)) { - *se = parent_entity(*se); - *pse = parent_entity(*pse); - } -} - -#else /* !CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - -#define entity_is_task(se) 1 - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ -} - -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return NULL; -} - -static inline void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec); - -/************************************************************** - * Scheduling class tree data structure manipulation methods: - */ - -static inline u64 
max_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) - min_vruntime = vruntime; - - return min_vruntime; -} - -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) - min_vruntime = vruntime; - - return min_vruntime; -} - -static inline int entity_before(struct sched_entity *a, - struct sched_entity *b) -{ - return (s64)(a->vruntime - b->vruntime) < 0; -} - -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = cfs_rq->min_vruntime; - - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; - - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); - - if (!cfs_rq->curr) - vruntime = se->vruntime; - else - vruntime = min_vruntime(vruntime, se->vruntime); - } - - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - -/* - * Enqueue an entity into the rb-tree: - */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - int leftmost = 1; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. 
- */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = 0; - } - } - - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); -} - -static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); -} - -struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) -{ - struct rb_node *left = cfs_rq->rb_leftmost; - - if (!left) - return NULL; - - return rb_entry(left, struct sched_entity, run_node); -} - -static struct sched_entity *__pick_next_entity(struct sched_entity *se) -{ - struct rb_node *next = rb_next(&se->run_node); - - if (!next) - return NULL; - - return rb_entry(next, struct sched_entity, run_node); -} - -#ifdef CONFIG_SCHED_DEBUG -struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) -{ - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - - if (!last) - return NULL; - - return rb_entry(last, struct sched_entity, run_node); -} - -/************************************************************** - * Scheduling class statistics methods: - */ - -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int factor = get_update_sysctl_factor(); - - if (ret || !write) - return ret; - - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - -#define WRT_SYSCTL(name) \ - (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); - WRT_SYSCTL(sched_latency); - 
WRT_SYSCTL(sched_wakeup_granularity); -#undef WRT_SYSCTL - - return 0; -} -#endif - -/* - * delta /= w - */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) -{ - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); - - return delta; -} - -/* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ? l : l*nr/nl - */ -static u64 __sched_period(unsigned long nr_running) -{ - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; -} - -/* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. - * - * s = p*P[w/rw] - */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - - for_each_sched_entity(se) { - struct load_weight *load; - struct load_weight lw; - - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; - - if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; - - update_load_add(&lw, se->load.weight); - load = &lw; - } - slice = calc_delta_mine(slice, se->load.weight, load); - } - return slice; -} - -/* - * We calculate the vruntime slice of a to be inserted task - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. 
- */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif -} - -static void update_curr(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; - unsigned long delta_exec; - - if (unlikely(!curr)) - return; - - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) - return; - - __update_curr(cfs_rq, curr, delta_exec); - curr->exec_start = now; - - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); - } - - account_cfs_rq_runtime(cfs_rq, delta_exec); -} - -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); -} - -/* - * Task is being enqueued - update stats: - */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * Are we enqueueing a waiting task? 
(for current tasks - * a dequeue/enqueue event is a NOP) - */ - if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); -} - -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * Mark the end of the wait period if dequeueing a - * waiting task: - */ - if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); -} - -/* - * We are picking a new current task - update its stats: - */ -static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* - * We are starting a new run period: - */ - se->exec_start = rq_of(cfs_rq)->clock_task; -} - -/************************************************** - * Scheduling class queueing methods: - */ - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - -static void -account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } - 
cfs_rq->nr_running++; -} - -static void -account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); - list_del_init(&se->group_node); - } - cfs_rq->nr_running--; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -# ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline 
assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) -{ - long tg_weight; - - /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). - */ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} - -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; - - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; - - shares = (tg->shares * load); - if (tg_weight) - shares /= tg_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; - - return shares; -} - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} -# else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - return tg->shares; -} - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} -# endif /* CONFIG_SMP */ -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) -{ - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - } - - update_load_set(&se->load, weight); - - if (se->on_rq) - 
account_entity_enqueue(cfs_rq, se); -} - -static void update_cfs_shares(struct cfs_rq *cfs_rq) -{ - struct task_group *tg; - struct sched_entity *se; - long shares; - - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) - return; -#ifndef CONFIG_SMP - if (likely(se->load.weight == tg->shares)) - return; -#endif - shares = calc_cfs_shares(cfs_rq, tg); - - reweight_entity(cfs_rq_of(se), se, shares); -} -#else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) -{ -} - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that 
the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; - - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); -#endif -} - -static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -{ - u64 vruntime = cfs_rq->min_vruntime; - - /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. - */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); - - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh = sysctl_sched_latency; - - /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: - */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; - } - - /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; -} - -static void check_enqueue_throttle(struct cfs_rq *cfs_rq); - -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ - /* - * Update the normalized vruntime before updating min_vruntime - * through callig update_curr(). - */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) - se->vruntime += cfs_rq->min_vruntime; - - /* - * Update run-time statistics of the 'current'. 
- */ - update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); - account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); - - if (flags & ENQUEUE_WAKEUP) { - place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); - } - - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); - se->on_rq = 1; - - if (cfs_rq->nr_running == 1) { - list_add_leaf_cfs_rq(cfs_rq); - check_enqueue_throttle(cfs_rq); - } -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else - break; - } -} - -static void __clear_buddies_next(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else - break; - } -} - -static void __clear_buddies_skip(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else - break; - } -} - -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->last == se) - __clear_buddies_last(se); - - if (cfs_rq->next == se) - __clear_buddies_next(se); - - if (cfs_rq->skip == se) - __clear_buddies_skip(se); -} - -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); - -static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ - /* - * Update run-time statistics of the 'current'. 
- */ - update_curr(cfs_rq); - - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; - } -#endif - } - - clear_buddies(cfs_rq, se); - - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->on_rq = 0; - update_cfs_load(cfs_rq, 0); - account_entity_dequeue(cfs_rq, se); - - /* - * Normalize the entity after updating the min_vruntime because the - * update can refer to the ->curr item and we need to reflect this - * movement in our normalized position. - */ - if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; - - /* return excess runtime on last dequeue */ - return_cfs_rq_runtime(cfs_rq); - - update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; - - ideal_runtime = sched_slice(cfs_rq, curr); - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. 
- */ - if (delta_exec < sysctl_sched_min_granularity) - return; - - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); -} - -static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - /* 'current' is not kept within the tree. */ - if (se->on_rq) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. - */ - update_stats_wait_end(cfs_rq, se); - __dequeue_entity(cfs_rq, se); - } - - update_stats_curr_start(cfs_rq, se); - cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS - /* - * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it - * when there are only lesser-weight tasks around): - */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); - } -#endif - se->prev_sum_exec_runtime = se->sum_exec_runtime; -} - -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - -/* - * Pick the next process, keeping these things in mind, in this order: - * 1) keep things fair between processes/task groups - * 2) pick the "next" process, since someone really wants that to run - * 3) pick the "last" process, for cache locality - * 4) do not run the "skip" process, if something else is available - */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; - - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. 
- */ - if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } - - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; - - clear_buddies(cfs_rq, se); - - return se; -} - -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); - -static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) -{ - /* - * If still on the runqueue then deactivate_task() - * was not called and update_curr() has to be done: - */ - if (prev->on_rq) - update_curr(cfs_rq); - - /* throttle cfs_rqs exceeding runtime */ - check_cfs_rq_runtime(cfs_rq); - - check_spread(cfs_rq, prev); - if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); - /* Put 'current' back into the tree. */ - __enqueue_entity(cfs_rq, prev); - } - cfs_rq->curr = NULL; -} - -static void -entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) -{ - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - /* - * Update share accounting for long-running entities. - */ - update_entity_shares_tick(cfs_rq); - -#ifdef CONFIG_SCHED_HRTICK - /* - * queued ticks are scheduled to match the slice, so don't bother - * validating it and just reschedule. 
- */ - if (queued) { - resched_task(rq_of(cfs_rq)->curr); - return; - } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; -#endif - - if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); -} - - -/************************************************** - * CFS bandwidth control machinery - */ - -#ifdef CONFIG_CFS_BANDWIDTH - -#ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; - -static inline bool cfs_bandwidth_used(void) -{ - return static_branch(&__cfs_bandwidth_used); -} - -void account_cfs_bandwidth_used(int enabled, int was_enabled) -{ - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); -} -#else /* HAVE_JUMP_LABEL */ -static bool cfs_bandwidth_used(void) -{ - return true; -} - -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} -#endif /* HAVE_JUMP_LABEL */ - -/* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} - -static inline u64 sched_cfs_bandwidth_slice(void) -{ - return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; -} - -/* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. 
- * - * requires cfs_b->lock - */ -void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) -{ - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); -} - -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return &tg->cfs_bandwidth; -} - -/* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - - /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; - - raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota == RUNTIME_INF) - amount = min_amount; - else { - /* - * If the bandwidth pool has become inactive, then at least one - * period must have elapsed since the last consumption. - * Refresh the global state and ensure bandwidth timer becomes - * active. - */ - if (!cfs_b->timer_active) { - __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); - } - - if (cfs_b->runtime > 0) { - amount = min(cfs_b->runtime, min_amount); - cfs_b->runtime -= amount; - cfs_b->idle = 0; - } - } - expires = cfs_b->runtime_expires; - raw_spin_unlock(&cfs_b->lock); - - cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) - cfs_rq->runtime_expires = expires; - - return cfs_rq->runtime_remaining > 0; -} - -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. 
- */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct rq *rq = rq_of(cfs_rq); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. - */ - - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) -{ - /* dock delta_exec before expiring quota (as it could span periods) */ - cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); - - if (likely(cfs_rq->runtime_remaining > 0)) - return; - - /* - * if we're unable to extend our runtime we resched so that the active - * hierarchy can be throttled - */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); -} - -static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) -{ - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) - return; - - __account_cfs_rq_runtime(cfs_rq, delta_exec); -} - -static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) -{ - return cfs_bandwidth_used() && cfs_rq->throttled; -} - -/* check whether cfs_rq, or any parent, is throttled */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) -{ - return cfs_bandwidth_used() && cfs_rq->throttle_count; -} - -/* - * Ensure that 
neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. - */ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; - - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; - - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); -} - -/* updated child weight may affect parent so we have to do this bottom up */ -static int tg_unthrottle_up(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - cfs_rq->throttle_count--; -#ifdef CONFIG_SMP - if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); - } -#endif - - return 0; -} - -static int tg_throttle_down(struct task_group *tg, void *data) -{ - struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - - /* group is entering throttled state, record last load */ - if (!cfs_rq->throttle_count) - update_cfs_load(cfs_rq, 0); - cfs_rq->throttle_count++; - - return 0; -} - -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long task_delta, dequeue = 1; - - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - - /* account load preceding throttle */ - rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); - rcu_read_unlock(); - - task_delta = cfs_rq->h_nr_running; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - break; - - if (dequeue) - 
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; - - if (qcfs_rq->load.weight) - dequeue = 0; - } - - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; - raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - raw_spin_unlock(&cfs_b->lock); -} - -void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - int enqueue = 1; - long task_delta; - - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - - cfs_rq->throttled = 0; - raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; - list_del_rcu(&cfs_rq->throttled_list); - raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; - - update_rq_clock(rq); - /* update hierarchical throttle state */ - walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - - if (!cfs_rq->load.weight) - return; - - task_delta = cfs_rq->h_nr_running; - for_each_sched_entity(se) { - if (se->on_rq) - enqueue = 0; - - cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); - cfs_rq->h_nr_running += task_delta; - - if (cfs_rq_throttled(cfs_rq)) - break; - } - - if (!se) - rq->nr_running += task_delta; - - /* determine whether we need to wake up potentially idle cpu */ - if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); -} - -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) -{ - struct cfs_rq *cfs_rq; - u64 runtime = remaining; - - rcu_read_lock(); - list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, - throttled_list) { - struct rq *rq = rq_of(cfs_rq); - - raw_spin_lock(&rq->lock); - if (!cfs_rq_throttled(cfs_rq)) - goto next; - - runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= 
runtime; - - cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; - - /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); - -next: - raw_spin_unlock(&rq->lock); - - if (!remaining) - break; - } - rcu_read_unlock(); - - return remaining; -} - -/* - * Responsible for refilling a task_group's bandwidth and unthrottling its - * cfs_rqs as appropriate. If there has been no activity within the last - * period the timer is deactivated until scheduling resumes; cfs_b->idle is - * used to track this state. - */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) -{ - u64 runtime, runtime_expires; - int idle = 1, throttled; - - raw_spin_lock(&cfs_b->lock); - /* no need to continue the timer with no bandwidth constraint */ - if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; - cfs_b->nr_periods += overrun; - - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; - - __refill_cfs_bandwidth_runtime(cfs_b); - - if (!throttled) { - /* mark as potentially idle for the upcoming period */ - cfs_b->idle = 1; - goto out_unlock; - } - - /* account preceding periods in which throttling occurred */ - cfs_b->nr_throttled += overrun; - - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. - */ - runtime = cfs_b->runtime; - runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; - - /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. 
- */ - while (throttled && runtime > 0) { - raw_spin_unlock(&cfs_b->lock); - /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); - raw_spin_lock(&cfs_b->lock); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); - } - - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; - /* - * While we are ensured activity in the period following an - * unthrottle, this also covers the case in which the new bandwidth is - * insufficient to cover the existing bandwidth deficit. (Forcing the - * timer to remain active while there are any throttled entities.) - */ - cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - - return idle; -} - -/* a cfs_rq won't donate quota below this amount */ -static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; -/* minimum remaining period time to redistribute slack quota */ -static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; -/* how long we wait to gather additional slack before distributing */ -static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; - -/* are we near the end of the current quota period? */ -static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) -{ - struct hrtimer *refresh_timer = &cfs_b->period_timer; - u64 remaining; - - /* if the call-back is running a quota refresh is already occurring */ - if (hrtimer_callback_running(refresh_timer)) - return 1; - - /* is a quota refresh about to occur? 
*/ - remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); - if (remaining < min_expire) - return 1; - - return 0; -} - -static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) -{ - u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; - - /* if there's a quota refresh soon don't bother with slack */ - if (runtime_refresh_within(cfs_b, min_left)) - return; - - start_bandwidth_timer(&cfs_b->slack_timer, - ns_to_ktime(cfs_bandwidth_slack_period)); -} - -/* we know any runtime found here is valid as update_curr() precedes return */ -static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; - - if (slack_runtime <= 0) - return; - - raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { - cfs_b->runtime += slack_runtime; - - /* we are under rq->lock, defer unthrottling using a timer */ - if (cfs_b->runtime > sched_cfs_bandwidth_slice() && - !list_empty(&cfs_b->throttled_cfs_rq)) - start_cfs_slack_bandwidth(cfs_b); - } - raw_spin_unlock(&cfs_b->lock); - - /* even if it's not valid for return we don't want to try again */ - cfs_rq->runtime_remaining -= slack_runtime; -} - -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) - return; - - __return_cfs_rq_runtime(cfs_rq); -} - -/* - * This is done with a timer (instead of inline with bandwidth return) since - * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. 
- */ -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) -{ - u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; - - /* confirm we're still not at a refresh boundary */ - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) - return; - - raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { - runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } - expires = cfs_b->runtime_expires; - raw_spin_unlock(&cfs_b->lock); - - if (!runtime) - return; - - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - - raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; - raw_spin_unlock(&cfs_b->lock); -} - -/* - * When a group wakes up we want to make sure that its quota is not already - * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. - */ -static void check_enqueue_throttle(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return; - - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) - return; - - /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) - return; - - /* update runtime allocation */ - account_cfs_rq_runtime(cfs_rq, 0); - if (cfs_rq->runtime_remaining <= 0) - throttle_cfs_rq(cfs_rq); -} - -/* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return; - - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; - - /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. 
- */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); -} - -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - -static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, slack_timer); - do_sched_cfs_slack_timer(cfs_b); - - return HRTIMER_NORESTART; -} - -static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - - if (!overrun) - break; - - idle = do_sched_cfs_period_timer(cfs_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - raw_spin_lock_init(&cfs_b->lock); - cfs_b->runtime = 0; - cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); - - INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->period_timer.function = sched_cfs_period_timer; - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; -} - -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - cfs_rq->runtime_enabled = 0; - INIT_LIST_HEAD(&cfs_rq->throttled_list); -} - -/* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). 
In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { - raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) - return; - } - - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); -} - -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - hrtimer_cancel(&cfs_b->period_timer); - hrtimer_cancel(&cfs_b->slack_timer); -} - -void unthrottle_offline_cfs_rqs(struct rq *rq) -{ - struct cfs_rq *cfs_rq; - - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - if (!cfs_rq->runtime_enabled) - continue; - - /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = cfs_b->quota; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); - } -} - -#else /* CONFIG_CFS_BANDWIDTH */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} - -static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) -{ - return 0; -} - -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -#endif - -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return NULL; -} -static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void 
unthrottle_offline_cfs_rqs(struct rq *rq) {} - -#endif /* CONFIG_CFS_BANDWIDTH */ - -/************************************************** - * CFS operations on tasks: - */ - -#ifdef CONFIG_SCHED_HRTICK -static void hrtick_start_fair(struct rq *rq, struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - WARN_ON(task_rq(p) != rq); - - if (cfs_rq->nr_running > 1) { - u64 slice = sched_slice(cfs_rq, se); - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; - s64 delta = slice - ran; - - if (delta < 0) { - if (rq->curr == p) - resched_task(p); - return; - } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - - hrtick_start(rq, delta); - } -} - -/* - * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. - */ -static void hrtick_update(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) - return; - - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); -} -#else /* !CONFIG_SCHED_HRTICK */ -static inline void -hrtick_start_fair(struct rq *rq, struct task_struct *p) -{ -} - -static inline void hrtick_update(struct rq *rq) -{ -} -#endif - -/* - * The enqueue_task method is called before nr_running is - * increased. 
Here we update the fair scheduling stats and - * then put the task into the rbtree: - */ -static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - if (se->on_rq) - break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, flags); - - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running increment below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; - cfs_rq->h_nr_running++; - - flags = ENQUEUE_WAKEUP; - } - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_running++; - - if (cfs_rq_throttled(cfs_rq)) - break; - - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } - - if (!se) - inc_nr_running(rq); - hrtick_update(rq); -} - -static void set_next_buddy(struct sched_entity *se); - -/* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: - */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - int task_sleep = flags & DEQUEUE_SLEEP; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, flags); - - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running decrement below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; - cfs_rq->h_nr_running--; - - /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) { - /* - * Bias pick_next to pick a task from this cfs_rq, as - * p is sleeping when it is within its sched_slice. 
- */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); - break; - } - flags |= DEQUEUE_SLEEP; - } - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_running--; - - if (cfs_rq_throttled(cfs_rq)) - break; - - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } - - if (!se) - dec_nr_running(rq); - hrtick_update(rq); -} - -#ifdef CONFIG_SMP -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. 
- */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -static unsigned long power_of(int cpu) -{ - return cpu_rq(cpu)->cpu_power; -} - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - - if (nr_running) - return rq->load.weight / nr_running; - - return 0; -} - - -static void task_waking_fair(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * Calculate the effective load difference if @wl is added (subtracted) to @tg - * on this @cpu and results in a total addition (subtraction) of @wg to the - * total group weight. 
- * - * Given a runqueue weight distribution (rw_i) we can compute a shares - * distribution (s_i) using: - * - * s_i = rw_i / \Sum rw_j (1) - * - * Suppose we have 4 CPUs and our @tg is a direct child of the root group and - * has 7 equal weight tasks, distributed as below (rw_i), with the resulting - * shares distribution (s_i): - * - * rw_i = { 2, 4, 1, 0 } - * s_i = { 2/7, 4/7, 1/7, 0 } - * - * As per wake_affine() we're interested in the load of two CPUs (the CPU the - * task used to run on and the CPU the waker is running on), we need to - * compute the effect of waking a task on either CPU and, in case of a sync - * wakeup, compute the effect of the current task going to sleep. - * - * So for a change of @wl to the local @cpu with an overall group weight change - * of @wl we can compute the new shares distribution (s'_i) using: - * - * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) - * - * Suppose we're interested in CPUs 0 and 1, and want to compute the load - * differences in waking a task to CPU 0. The additional task changes the - * weight and shares distributions like: - * - * rw'_i = { 3, 4, 1, 0 } - * s'_i = { 3/8, 4/8, 1/8, 0 } - * - * We can then compute the difference in effective weight by using: - * - * dw_i = S * (s'_i - s_i) (3) - * - * Where 'S' is the group weight as seen by its parent. - * - * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) - * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - - * 4/7) times the weight of the group. 
- */ -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - - if (!tg->parent) /* the trivial, non-cgroup case */ - return wl; - - for_each_sched_entity(se) { - long w, W; - - tg = se->my_q->tg; - - /* - * W = @wg + \Sum rw_j - */ - W = wg + calc_tg_weight(tg, se->my_q); - - /* - * w = rw_i + @wl - */ - w = se->my_q->load.weight + wl; - - /* - * wl = S * s'_i; see (2) - */ - if (W > 0 && w < W) - wl = (w * tg->shares) / W; - else - wl = tg->shares; - - /* - * Per the above, wl is the new se->load.weight value; since - * those are clipped to [MIN_SHARES, ...) do so now. See - * calc_cfs_shares(). - */ - if (wl < MIN_SHARES) - wl = MIN_SHARES; - - /* - * wl = dw_i = S * (s'_i - s_i); see (3) - */ - wl -= se->load.weight; - - /* - * Recursively apply this logic to all parent groups to compute - * the final effective load change on the root group. Since - * only the @tg group gets extra weight, all parent groups can - * only redistribute existing shares. @wl is the shift in shares - * resulting from this level per the above. 
- */ - wg = 0; - } - - return wl; -} -#else - -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) -{ - return wl; -} - -#endif - -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) -{ - s64 this_load, load; - int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - tg = task_group(current); - weight = current->se.load.weight; - - this_load += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.load.weight; - - /* - * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped this_load to 0, we'll - * always have an imbalance, but there's really nothing you can do - * about that, so that's good too. - * - * Otherwise check if either cpus are near enough in load to allow this - * task to be woken on this_cpu. 
- */ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; - - this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); - this_eff_load *= this_load + - effective_load(tg, this_cpu, weight, weight); - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); - prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); - - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; - - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); - - return 1; - } - return 0; -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. 
- */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - tsk_cpus_allowed(p))) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; - - if (local_group) { - this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * Try and locate an idle CPU in the sched_domain. 
- */ -static int select_idle_sibling(struct task_struct *p, int target) -{ - int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); - struct sched_domain *sd; - struct sched_group *sg; - int i; - - /* - * If the task is going to be woken-up on this cpu and if it is - * already idle, then it is the right target. - */ - if (target == cpu && idle_cpu(cpu)) - return cpu; - - /* - * If the task is going to be woken-up on the cpu where it previously - * ran and if it is currently idle, then it the right target. - */ - if (target == prev_cpu && idle_cpu(prev_cpu)) - return prev_cpu; - - /* - * Otherwise, iterate the domains and find an elegible idle cpu. - */ - rcu_read_lock(); - - sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); - } -done: - rcu_read_unlock(); - - return target; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. 
- */ -static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) -{ - struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; - int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); - int new_cpu = cpu; - int want_affine = 0; - int want_sd = 1; - int sync = wake_flags & WF_SYNC; - - if (p->rt.nr_cpus_allowed == 1) - return prev_cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } - - rcu_read_lock(); - for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; - - /* - * If power savings logic is enabled for a domain, see if we - * are not overloaded, if so, don't balance wider. - */ - if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { - unsigned long power = 0; - unsigned long nr_running = 0; - unsigned long capacity; - int i; - - for_each_cpu(i, sched_domain_span(tmp)) { - power += power_of(i); - nr_running += cpu_rq(i)->cfs.nr_running; - } - - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - nr_running /= 2; - - if (nr_running < capacity) - want_sd = 0; - } - - /* - * If both cpu and prev_cpu are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. 
- */ - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - affine_sd = tmp; - want_affine = 0; - } - - if (!want_sd && !want_affine) - break; - - if (!(tmp->flags & sd_flag)) - continue; - - if (want_sd) - sd = tmp; - } - - if (affine_sd) { - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; - } - - while (sd) { - int load_idx = sd->forkexec_idx; - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } -unlock: - rcu_read_unlock(); - - return new_cpu; -} -#endif /* CONFIG_SMP */ - -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; - - /* - * Since its curr running now, convert the gran from real-time - * to virtual-time in his units. - * - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. 
- */ - return calc_delta_fair(gran, se); -} - -/* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff <= 0) - return -1; - - gran = wakeup_gran(curr, se); - if (vdiff > gran) - return 1; - - return 0; -} - -static void set_last_buddy(struct sched_entity *se) -{ - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) - return; - - for_each_sched_entity(se) - cfs_rq_of(se)->last = se; -} - -static void set_next_buddy(struct sched_entity *se) -{ - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) - return; - - for_each_sched_entity(se) - cfs_rq_of(se)->next = se; -} - -static void set_skip_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -{ - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; - int next_buddy_marked = 0; - - if (unlikely(se == pse)) - return; - - /* - * This is possible from callers such as pull_task(), in which we - * unconditionally check_prempt_curr() after an enqueue (which may have - * lead to a throttle). This both saves work and prevents false - * next-buddy nomination below. - */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) - return; - - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { - set_next_buddy(pse); - next_buddy_marked = 1; - } - - /* - * We can come here with TIF_NEED_RESCHED already set from new task - * wake up path. 
- * - * Note: this also catches the edge-case of curr being in a throttled - * group (e.g. via set_curr_task), since update_curr() (in the - * enqueue of curr) will have resulted in resched being set. This - * prevents us from potentially nominating it as a false LAST_BUDDY - * below. - */ - if (test_tsk_need_resched(curr)) - return; - - /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(curr->policy == SCHED_IDLE) && - likely(p->policy != SCHED_IDLE)) - goto preempt; - - /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ - if (unlikely(p->policy != SCHED_NORMAL)) - return; - - find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); - BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. - */ - if (!next_buddy_marked) - set_next_buddy(pse); - goto preempt; - } - - return; - -preempt: - resched_task(curr); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. 
- */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); -} - -static struct task_struct *pick_next_task_fair(struct rq *rq) -{ - struct task_struct *p; - struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; - - if (!cfs_rq->nr_running) - return NULL; - - do { - se = pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); - - return p; -} - -/* - * Account for a descheduled task: - */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) -{ - struct sched_entity *se = &prev->se; - struct cfs_rq *cfs_rq; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se); - } -} - -/* - * sched_yield() is very simple - * - * The magic of dealing with the ->skip buddy is in pick_next_entity. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - /* - * Tell update_rq_clock() that we've just updated, - * so we don't do microscopic update in schedule() - * and double the fastpath cost. - */ - rq->skip_clock_update = 1; - } - - set_skip_buddy(se); -} - -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) -{ - struct sched_entity *se = &p->se; - - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) - return false; - - /* Tell the scheduler that we'd really like pse to run next. 
*/ - set_next_buddy(se); - - yield_task_fair(rq); - - return true; -} - -#ifdef CONFIG_SMP -/************************************************** - * Fair scheduling class load-balancing methods: - */ - -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) -{ - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); -} - -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (unlikely(p->policy == SCHED_IDLE)) - return 0; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 - -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) -{ - int tsk_cache_hot = 0; - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. 
- */ - if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { - schedstat_inc(p, se.statistics.nr_failed_migrations_affine); - return 0; - } - *lb_flags &= ~LBF_ALL_PINNED; - - if (task_running(rq, p)) { - schedstat_inc(p, se.statistics.nr_failed_migrations_running); - return 0; - } - - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. - */ - - tsk_cache_hot = task_hot(p, rq->clock_task, sd); - if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } -#endif - return 1; - } - - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; -} - -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - struct task_struct *p, *n; - struct cfs_rq *cfs_rq; - int pinned = 0; - - for_each_leaf_cfs_rq(busiest, cfs_rq) { - list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), - busiest->cpu, this_cpu)) - break; - - if (!can_migrate_task(p, busiest, this_cpu, - sd, idle, &pinned)) - continue; - - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). 
- */ - schedstat_inc(sd, lb_gained[idle]); - return 1; - } - } - - return 0; -} - -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, - struct cfs_rq *busiest_cfs_rq) -{ - int loops = 0, pulled = 0; - long rem_load_move = max_load_move; - struct task_struct *p, *n; - - if (max_load_move == 0) - goto out; - - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; - break; - } - - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) - continue; - - pull_task(busiest, p, this_rq, this_cpu); - pulled++; - rem_load_move -= p->se.load.weight; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. - */ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; - break; - } -#endif - - /* - * We only want to steal up to the prescribed amount of - * weighted load. - */ - if (rem_load_move <= 0) - break; - } -out: - /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). 
- */ - schedstat_add(sd, lb_gained[idle], pulled); - - return max_load_move - rem_load_move; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static int update_shares_cpu(struct task_group *tg, int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); - - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return 0; -} - -static void update_shares(int cpu) -{ - struct cfs_rq *cfs_rq; - struct rq *rq = cpu_rq(cpu); - - rcu_read_lock(); - /* - * Iterates the task_group tree in a bottom up fashion, see - * list_add_leaf_cfs_rq() for details. - */ - for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - update_shares_cpu(cfs_rq->tg, cpu); - } - rcu_read_unlock(); -} - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. 
- */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) -{ - long rem_load_move = max_load_move; - struct cfs_rq *busiest_cfs_rq; - - rcu_read_lock(); - update_h_load(cpu_of(busiest)); - - for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; - u64 rem_load, moved_load; - - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - - /* - * empty group or part of a throttled hierarchy - */ - if (!busiest_cfs_rq->task_weight || - throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, - busiest_cfs_rq); - - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, busiest_weight + 1); - - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); - - return max_load_move - rem_load_move; -} -#else -static inline void update_shares(int cpu) -{ -} - -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) -{ - return balance_tasks(this_rq, this_cpu, 
busiest, - max_load_move, sd, idle, lb_flags, - &busiest->cfs); -} -#endif - -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) -{ - unsigned long total_load_moved = 0, load_moved; - - do { - load_moved = load_balance_fair(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, lb_flags); - - total_load_moved += load_moved; - - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. - */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; - break; - } -#endif - } while (load_moved && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - -/********** Helpers for find_busiest_group ************************/ -/* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. 
- */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - unsigned long this_has_capacity; - unsigned int this_idle_cpus; - - /* Statistics of the busiest group */ - unsigned int busiest_idle_cpus; - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - unsigned long busiest_group_capacity; - unsigned long busiest_has_capacity; - unsigned int busiest_group_weight; - - int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ -#endif -}; - -/* - * sg_lb_stats - stats of a sched_group required for load_balancing - */ -struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ - unsigned long group_capacity; - unsigned long idle_cpus; - unsigned long group_weight; - int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? 
*/ -}; - -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. 
- */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } -} - -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. 
- * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. - */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - if (!sds->power_savings_balance) - return 0; - - if (sds->this != sds->group_leader || - sds->group_leader == sds->group_min) - return 0; - - *imbalance = sds->min_load_per_task; - sds->busiest = sds->group_min; - - return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return SCHED_POWER_SCALE; -} - -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return default_scale_freq_power(sd, cpu); -} - -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; -} - -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ - return default_scale_smt_power(sd, cpu); -} - -unsigned long scale_rt_power(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 total, available; - - total = sched_avg_period() + (rq->clock - rq->age_stamp); - - if (unlikely(total < rq->rt_avg)) { - /* 
Ensures that power won't end up being negative */ - available = 0; - } else { - available = total - rq->rt_avg; - } - - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; - - total >>= SCHED_POWER_SHIFT; - - return div_u64(available, total); -} - -static void update_cpu_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; - struct sched_group *sdg = sd->groups; - - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); - else - power *= default_scale_smt_power(sd, cpu); - - power >>= SCHED_POWER_SHIFT; - } - - sdg->sgp->power_orig = power; - - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_POWER_SHIFT; - - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; - - if (!power) - power = 1; - - cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; -} - -void update_group_power(struct sched_domain *sd, int cpu) -{ - struct sched_domain *child = sd->child; - struct sched_group *group, *sdg = sd->groups; - unsigned long power; - - if (!child) { - update_cpu_power(sd, cpu); - return; - } - - power = 0; - - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); - - sdg->sgp->power = power; -} - -/* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). - */ -static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) -{ - /* - * Only siblings can have significantly less than SCHED_POWER_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return 0; - - /* - * If ~90% of the cpu_power is still there, we're good. 
- */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) - return 1; - - return 0; -} - -/** - * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. - * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. - * @sgs: variable to hold the statistics for this group. - */ -static inline void update_sg_lb_stats(struct sched_domain *sd, - struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) -{ - unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; - int i; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long avg_load_per_task = 0; - - if (local_group) - balance_cpu = group_first_cpu(group); - - /* Tally up the load of all CPUs in the group */ - max_cpu_load = 0; - min_cpu_load = ~0UL; - max_nr_running = 0; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { - struct rq *rq = cpu_rq(i); - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) { - max_cpu_load = load; - max_nr_running = rq->nr_running; - } - if (min_cpu_load > load) - min_cpu_load = load; - } - - sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; - sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) - sgs->idle_cpus++; - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing 
at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); - } - - /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - if (sgs->sum_nr_running) - avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) - sgs->group_imb = 1; - - sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, - SCHED_POWER_SCALE); - if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(sd, group); - sgs->group_weight = group->group_weight; - - if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; -} - -/** - * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked - * @sds: sched_domain statistics - * @sg: sched_group candidate to be checked for being the busiest - * @sgs: sched_group statistics - * @this_cpu: the current cpu - * - * Determine if @sg is a busier group than the previously selected - * busiest group. 
- */ -static bool update_sd_pick_busiest(struct sched_domain *sd, - struct sd_lb_stats *sds, - struct sched_group *sg, - struct sg_lb_stats *sgs, - int this_cpu) -{ - if (sgs->avg_load <= sds->max_load) - return false; - - if (sgs->sum_nr_running > sgs->group_capacity) - return true; - - if (sgs->group_imb) - return true; - - /* - * ASYM_PACKING needs to move all the work to the lowest - * numbered CPUs in the group, therefore mark all groups - * higher than ourself as busy. - */ - if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - this_cpu < group_first_cpu(sg)) { - if (!sds->busiest) - return true; - - if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) - return true; - } - - return false; -} - -/** - * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. - * @sds: variable to hold the statistics for this sched_domain. 
- */ -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) -{ - struct sched_domain *child = sd->child; - struct sched_group *sg = sd->groups; - struct sg_lb_stats sgs; - int load_idx, prefer_sibling = 0; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; - - init_sd_power_savings_stats(sd, sds, idle); - load_idx = get_sd_load_idx(sd, idle); - - do { - int local_group; - - local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); - memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, - local_group, cpus, balance, &sgs); - - if (local_group && !(*balance)) - return; - - sds->total_load += sgs.group_load; - sds->total_pwr += sg->sgp->power; - - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). 
- */ - if (prefer_sibling && !local_group && sds->this_has_capacity) - sgs.group_capacity = min(sgs.group_capacity, 1UL); - - if (local_group) { - sds->this_load = sgs.avg_load; - sds->this = sg; - sds->this_nr_running = sgs.sum_nr_running; - sds->this_load_per_task = sgs.sum_weighted_load; - sds->this_has_capacity = sgs.group_has_capacity; - sds->this_idle_cpus = sgs.idle_cpus; - } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { - sds->max_load = sgs.avg_load; - sds->busiest = sg; - sds->busiest_nr_running = sgs.sum_nr_running; - sds->busiest_idle_cpus = sgs.idle_cpus; - sds->busiest_group_capacity = sgs.group_capacity; - sds->busiest_load_per_task = sgs.sum_weighted_load; - sds->busiest_has_capacity = sgs.group_has_capacity; - sds->busiest_group_weight = sgs.group_weight; - sds->group_imb = sgs.group_imb; - } - - update_sd_power_savings_stats(sg, sds, local_group, &sgs); - sg = sg->next; - } while (sg != sd->groups); -} - -/** - * check_asym_packing - Check to see if the group is packed into the - * sched doman. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Returns 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in *imbalance. - * - * @sd: The sched_domain whose packing is to be checked. 
- * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. - */ -static int check_asym_packing(struct sched_domain *sd, - struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - int busiest_cpu; - - if (!(sd->flags & SD_ASYM_PACKING)) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = group_first_cpu(sds->busiest); - if (this_cpu > busiest_cpu) - return 0; - - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, - SCHED_POWER_SCALE); - return 1; -} - -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. - */ -static inline void fix_small_imbalance(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - unsigned long tmp, pwr_now = 0, pwr_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; - - if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; - if (sds->busiest_load_per_task > - sds->this_load_per_task) - imbn = 1; - } else - sds->this_load_per_task = - cpu_avg_load_per_task(this_cpu); - - scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->sgp->power; - - if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { - *imbalance = sds->busiest_load_per_task; - return; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
- */ - - pwr_now += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_POWER_SCALE; - - /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; - if (sds->max_load > tmp) - pwr_move += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load - tmp); - - /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->sgp->power < - sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->sgp->power) / - sds->this->sgp->power; - else - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->sgp->power; - pwr_move += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_POWER_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = sds->busiest_load_per_task; -} - -/** - * calculate_imbalance - Calculate the amount of imbalance present within the - * groups of a given sched_domain during load balance. - * @sds: statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: Cpu for which currently load balance is being performed. - * @imbalance: The variable to store the imbalance. - */ -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, - unsigned long *imbalance) -{ - unsigned long max_pull, load_above_capacity = ~0UL; - - sds->busiest_load_per_task /= sds->busiest_nr_running; - if (sds->group_imb) { - sds->busiest_load_per_task = - min(sds->busiest_load_per_task, sds->avg_load); - } - - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) 
- */ - if (sds->max_load < sds->avg_load) { - *imbalance = 0; - return fix_small_imbalance(sds, this_cpu, imbalance); - } - - if (!sds->group_imb) { - /* - * Don't want to pull so many tasks that a group would go idle. - */ - load_above_capacity = (sds->busiest_nr_running - - sds->busiest_group_capacity); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - - load_above_capacity /= sds->busiest->sgp->power; - } - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load. At the same time, - * we also don't want to reduce the group load below the group capacity - * (so that we can implement power-savings policies etc). Thus we look - * for the minimum possible imbalance. - * Be careful of negative numbers as they'll appear as very large values - * with unsigned longs. - */ - max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->sgp->power, - (sds->avg_load - sds->this_load) * sds->this->sgp->power) - / SCHED_POWER_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < sds->busiest_load_per_task) - return fix_small_imbalance(sds, this_cpu, imbalance); - -} - -/******* find_busiest_group() helpers end here *********************/ - -/** - * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. 
- * - * Also calculates the amount of weighted load which should be moved - * to restore balance. - * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. - * @cpus: The set of CPUs under consideration for load-balancing. - * @balance: Pointer to a variable indicating if this_cpu - * is the appropriate cpu to perform load balancing at this_level. - * - * Returns: - the busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. - */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - const struct cpumask *cpus, int *balance) -{ - struct sd_lb_stats sds; - - memset(&sds, 0, sizeof(sds)); - - /* - * Compute the various statistics relavent for load balancing at - * this level. - */ - update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); - - /* - * this_cpu is not the appropriate cpu to perform load balancing at - * this level. - */ - if (!(*balance)) - goto ret; - - if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && - check_asym_packing(sd, &sds, this_cpu, imbalance)) - return sds.busiest; - - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || sds.busiest_nr_running == 0) - goto out_balanced; - - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; - - /* - * If the busiest group is imbalanced the below checks don't - * work because they assumes all things are equal, which typically - * isn't true due to cpus_allowed constraints and the like. 
- */ - if (sds.group_imb) - goto force_balance; - - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && - !sds.busiest_has_capacity) - goto force_balance; - - /* - * If the local group is more busy than the selected busiest group - * don't try and pull any tasks. - */ - if (sds.this_load >= sds.max_load) - goto out_balanced; - - /* - * Don't pull any tasks if this group is already above the domain - * average load. - */ - if (sds.this_load >= sds.avg_load) - goto out_balanced; - - if (idle == CPU_IDLE) { - /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. - */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && - sds.busiest_nr_running <= sds.busiest_group_weight) - goto out_balanced; - } else { - /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. - */ - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } - -force_balance: - /* Looks like there is an imbalance. Compute it */ - calculate_imbalance(&sds, this_cpu, imbalance); - return sds.busiest; - -out_balanced: - /* - * There is no obvious imbalance. But check if we can do some balancing - * to save power. - */ - if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) - return sds.busiest; -ret: - *imbalance = 0; - return NULL; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in group. 
- */ -static struct rq * -find_busiest_queue(struct sched_domain *sd, struct sched_group *group, - enum cpu_idle_type idle, unsigned long imbalance, - const struct cpumask *cpus) -{ - struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; - int i; - - for_each_cpu(i, sched_group_cpus(group)) { - unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, - SCHED_POWER_SCALE); - unsigned long wl; - - if (!capacity) - capacity = fix_small_capacity(sd, group); - - if (!cpumask_test_cpu(i, cpus)) - continue; - - rq = cpu_rq(i); - wl = weighted_cpuload(i); - - /* - * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. - */ - if (capacity && rq->nr_running == 1 && wl > imbalance) - continue; - - /* - * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. - */ - wl = (wl * SCHED_POWER_SCALE) / power; - - if (wl > max_load) { - max_load = wl; - busiest = rq; - } - } - - return busiest; -} - -/* - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but - * so long as it is large enough. - */ -#define MAX_PINNED_INTERVAL 512 - -/* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); - -static int need_active_balance(struct sched_domain *sd, int idle, - int busiest_cpu, int this_cpu) -{ - if (idle == CPU_NEWLY_IDLE) { - - /* - * ASYM_PACKING needs to force migrate tasks from busy but - * higher numbered CPUs in order to pack all tasks in the - * lowest numbered CPUs. - */ - if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) - return 1; - - /* - * The only task running in a non-idle cpu can be moved to this - * cpu in an attempt to completely freeup the other CPU - * package. - * - * The package power saving logic comes from - * find_busiest_group(). 
If there are no imbalance, then - * f_b_g() will return NULL. However when sched_mc={1,2} then - * f_b_g() will select a group from which a running task may be - * pulled to this cpu in order to make the other package idle. - * If there is no opportunity to make a package idle and if - * there are no imbalance, then f_b_g() will return NULL and no - * action will be taken in load_balance_newidle(). - * - * Under normal task pull operation due to imbalance, there - * will be more than one task in the source run queue and - * move_tasks() will succeed. ld_moved will be true and this - * active balance code will not be triggered. - */ - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) - return 0; - } - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - -static int active_load_balance_cpu_stop(void *data); - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - */ -static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) -{ - int ld_moved, lb_flags = 0, active_balance = 0; - struct sched_group *group; - unsigned long imbalance; - struct rq *busiest; - unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - - cpumask_copy(cpus, cpu_active_mask); - - schedstat_inc(sd, lb_count[idle]); - -redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, - cpus, balance); - - if (*balance == 0) - goto out_balanced; - - if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); - goto out_balanced; - } - - busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[idle], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. 
If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. ld_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - lb_flags |= LBF_ALL_PINNED; - local_irq_save(flags); - double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); - double_rq_unlock(this_rq, busiest); - local_irq_restore(flags); - - /* - * some other cpu did the load balance for us. - */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); - - if (lb_flags & LBF_ABORT) - goto out_balanced; - - if (lb_flags & LBF_NEED_BREAK) { - lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (lb_flags & LBF_ABORT) - goto out_balanced; - goto redo; - } - - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) - goto redo; - goto out_balanced; - } - } - - if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); - /* - * Increment the failure counter only on periodic balance. - * We do not want newidle balance, which can be very - * frequent, pollute the failure counter causing - * excessive cache_hot migrations and active balances. - */ - if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; - - if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { - raw_spin_lock_irqsave(&busiest->lock, flags); - - /* don't kick the active_load_balance_cpu_stop, - * if the curr task on busiest cpu can't be - * moved to this_cpu - */ - if (!cpumask_test_cpu(this_cpu, - tsk_cpus_allowed(busiest->curr))) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); - lb_flags |= LBF_ALL_PINNED; - goto out_one_pinned; - } - - /* - * ->active_balance synchronizes accesses to - * ->active_balance_work. Once set, it's cleared - * only after active load balance is finished. 
- */ - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - raw_spin_unlock_irqrestore(&busiest->lock, flags); - - if (active_balance) - stop_one_cpu_nowait(cpu_of(busiest), - active_load_balance_cpu_stop, busiest, - &busiest->active_balance_work); - - /* - * We've kicked active balancing, reset the failure - * counter. - */ - sd->nr_balance_failed = sd->cache_nice_tries+1; - } - } else - sd->nr_balance_failed = 0; - - if (likely(!active_balance)) { - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * move_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - } - - goto out; - -out_balanced: - schedstat_inc(sd, lb_balanced[idle]); - - sd->nr_balance_failed = 0; - -out_one_pinned: - /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && - sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) - sd->balance_interval *= 2; - - ld_moved = 0; -out: - return ld_moved; -} - -/* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. - */ -void idle_balance(int this_cpu, struct rq *this_rq) -{ - struct sched_domain *sd; - int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; - - this_rq->idle_stamp = this_rq->clock; - - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; - - /* - * Drop the rq->lock, but keep IRQ/preempt disabled. 
- */ - raw_spin_unlock(&this_rq->lock); - - update_shares(this_cpu); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - unsigned long interval; - int balance = 1; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (sd->flags & SD_BALANCE_NEWIDLE) { - /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, &balance); - } - - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; - break; - } - } - rcu_read_unlock(); - - raw_spin_lock(&this_rq->lock); - - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ - this_rq->next_balance = next_balance; - } -} - -/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) -{ - struct rq *busiest_rq = data; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) - goto out_unlock; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. 
- */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - schedstat_inc(sd, alb_count); - - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; -} - -#ifdef CONFIG_NO_HZ -/* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing - * needed, they will kick the idle load balancer, which then does idle - * load balancing for all the idle CPUs. - */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd->flags & flag) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. 
- * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ - for (sd = lowest_flag_domain(cpu, flag); \ - (sd && (sd->flags & flag)); sd = sd->parent) - -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu: The cpu which is nominating a new idle_load_balancer. - * - * Returns: Returns the id of the idle load balancer if it exists, - * Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) -{ - int ilb = cpumask_first(nohz.idle_cpus_mask); - struct sched_group *ilbg; - struct sched_domain *sd; - - /* - * Have idle load balancer selection from semi-idle packages only - * when power-aware load balancing is enabled - */ - if (!(sched_smt_power_savings || sched_mc_power_savings)) - goto out_done; - - /* - * Optimize for the case when we have no idle CPUs or only one - * idle CPU. 
Don't walk the sched_domain hierarchy in such cases - */ - if (cpumask_weight(nohz.idle_cpus_mask) < 2) - goto out_done; - - rcu_read_lock(); - for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilbg = sd->groups; - - do { - if (ilbg->group_weight != - atomic_read(&ilbg->sgp->nr_busy_cpus)) { - ilb = cpumask_first_and(nohz.idle_cpus_mask, - sched_group_cpus(ilbg)); - goto unlock; - } - - ilbg = ilbg->next; - - } while (ilbg != sd->groups); - } -unlock: - rcu_read_unlock(); - -out_done: - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; - - return nr_cpu_ids; -} -#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ - return nr_cpu_ids; -} -#endif - -/* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). - */ -static void nohz_balancer_kick(int cpu) -{ - int ilb_cpu; - - nohz.next_balance++; - - ilb_cpu = find_new_ilb(cpu); - - if (ilb_cpu >= nr_cpu_ids) - return; - - if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) - return; - /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target cpu which - * is idle. And the softirq performing nohz idle load balance - * will be run before returning from the IPI. 
- */ - smp_send_reschedule(ilb_cpu); - return; -} - -static inline void clear_nohz_tick_stopped(int cpu) -{ - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } -} - -static inline void set_cpu_sd_state_busy(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - - rcu_read_lock(); - for_each_domain(cpu, sd) - atomic_inc(&sd->groups->sgp->nr_busy_cpus); - rcu_read_unlock(); -} - -void set_cpu_sd_state_idle(void) -{ - struct sched_domain *sd; - int cpu = smp_processor_id(); - - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - - rcu_read_lock(); - for_each_domain(cpu, sd) - atomic_dec(&sd->groups->sgp->nr_busy_cpus); - rcu_read_unlock(); -} - -/* - * This routine will record that this cpu is going idle with tick stopped. - * This info will be used in performing idle load balancing in the future. - */ -void select_nohz_load_balancer(int stop_tick) -{ - int cpu = smp_processor_id(); - - /* - * If this cpu is going down, then nothing needs to be done. - */ - if (!cpu_active(cpu)) - return; - - if (stop_tick) { - if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) - return; - - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - } - return; -} - -static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - clear_nohz_tick_stopped(smp_processor_id()); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} -#endif - -static DEFINE_SPINLOCK(balancing); - -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - -/* - * Scale the max load_balance interval with the number of CPUs in the system. 
- * This trades load-balance latency on larger machines for less cross talk. - */ -void update_max_interval(void) -{ - max_load_balance_interval = HZ*num_online_cpus()/10; -} - -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) -{ - int balance = 1; - struct rq *rq = cpu_rq(cpu); - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize; - - update_shares(cpu); - - rcu_read_lock(); - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); - - need_serialize = sd->flags & SD_SERIALIZE; - - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } - - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { - /* - * We've pulled tasks over so either we're no - * longer idle. - */ - idle = CPU_NOT_IDLE; - } - sd->last_balance = jiffies; - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } - - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!balance) - break; - } - rcu_read_unlock(); - - /* - * next_balance will be updated only when there is a need. - * When the cpu is attached to null domain for ex, it will not be - * updated. 
- */ - if (likely(update_next_balance)) - rq->next_balance = next_balance; -} - -#ifdef CONFIG_NO_HZ -/* - * In CONFIG_NO_HZ case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. - */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) -{ - struct rq *this_rq = cpu_rq(this_cpu); - struct rq *rq; - int balance_cpu; - - if (idle != CPU_IDLE || - !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) - goto end; - - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - raw_spin_lock_irq(&this_rq->lock); - update_rq_clock(this_rq); - update_cpu_load(this_rq); - raw_spin_unlock_irq(&this_rq->lock); - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - nohz.next_balance = this_rq->next_balance; -end: - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); -} - -/* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. - * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. - */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) -{ - unsigned long now = jiffies; - struct sched_domain *sd; - - if (unlikely(idle_cpu(cpu))) - return 0; - - /* - * We may be recently in ticked or tickless idle mode. At the first - * busy tick after returning from idle, we will update the busy stats. 
- */ - set_cpu_sd_state_busy(); - clear_nohz_tick_stopped(cpu); - - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; - - if (time_before(now, nohz.next_balance)) - return 0; - - if (rq->nr_running >= 2) - goto need_kick; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - struct sched_group *sg = sd->groups; - struct sched_group_power *sgp = sg->sgp; - int nr_busy = atomic_read(&sgp->nr_busy_cpus); - - if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) - goto need_kick_unlock; - - if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight - && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) - break; - } - rcu_read_unlock(); - return 0; - -need_kick_unlock: - rcu_read_unlock(); -need_kick: - return 1; -} -#else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } -#endif - -/* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). - */ -static void run_rebalance_domains(struct softirq_action *h) -{ - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - - rebalance_domains(this_cpu, idle); - - /* - * If this cpu has a pending nohz_balance_kick, then do the - * balancing on behalf of the other idle cpus whose ticks are - * stopped. - */ - nohz_idle_balance(this_cpu, idle); -} - -static inline int on_null_domain(int cpu) -{ - return !rcu_dereference_sched(cpu_rq(cpu)->sd); -} - -/* - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
- */ -void trigger_load_balance(struct rq *rq, int cpu) -{ - /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) - raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) - nohz_balancer_kick(cpu); -#endif -} - -static void rq_online_fair(struct rq *rq) -{ - update_sysctl(); -} - -static void rq_offline_fair(struct rq *rq) -{ - update_sysctl(); -} - -#endif /* CONFIG_SMP */ - -/* - * scheduler tick hitting a task of our scheduling class: - */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &curr->se; - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se, queued); - } -} - -/* - * called on fork with the child task as argument from the parent's context - * - child not yet on the tasklist - * - preemption disabled - */ -static void task_fork_fair(struct task_struct *p) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); - struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } - - update_curr(cfs_rq); - - if (curr) - se->vruntime = curr->vruntime; - place_entity(cfs_rq, se, 1); - - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); - } - - se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -/* - * Priority of the task has changed. 
Check to see if we preempt - * the current task. - */ -static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) -{ - if (!p->se.on_rq) - return; - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (rq->curr == p) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); -} - -static void switched_from_fair(struct rq *rq, struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - /* - * Ensure the task's vruntime is normalized, so that when its - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. - * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when - * the task is sleeping will it still have non-normalized vruntime. - */ - if (!se->on_rq && p->state != TASK_RUNNING) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } -} - -/* - * We switched to the sched_fair class. - */ -static void switched_to_fair(struct rq *rq, struct task_struct *p) -{ - if (!p->se.on_rq) - return; - - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); -} - -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. 
- */ -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - set_next_entity(cfs_rq, se); - /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); - } -} - -void init_cfs_rq(struct cfs_rq *cfs_rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) -{ - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 
- */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; - - if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; - set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; -} - -void free_fair_sched_group(struct task_group *tg) -{ - int i; - - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kzalloc_node(sizeof(struct sched_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - - init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); - } - - return 1; - -err_free_rq: - kfree(cfs_rq); -err: - return 0; -} - -void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. 
- */ - if (!tg->cfs_rq[cpu]->on_list) - return; - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - cfs_rq->tg = tg; - cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif - init_cfs_rq_runtime(cfs_rq); - - tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; - - /* se could be NULL for root_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - se->my_q = cfs_rq; - update_load_set(&se->load, 0); - se->parent = parent; -} - -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) -{ - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. 
- */ - if (!tg->se[0]) - return -EINVAL; - - shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - tg->shares = shares; - for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); - struct sched_entity *se; - - se = tg->se[i]; - /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - -done: - mutex_unlock(&shares_mutex); - return 0; -} -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - - -static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) -{ - struct sched_entity *se = &task->se; - unsigned int rr_interval = 0; - - /* - * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise - * idle runqueue: - */ - if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - - return rr_interval; -} - -/* - * All the scheduling class methods: - */ -const struct sched_class fair_sched_class = { - .next = &idle_sched_class, - .enqueue_task = enqueue_task_fair, - .dequeue_task = dequeue_task_fair, - .yield_task = yield_task_fair, - .yield_to_task = yield_to_task_fair, - - .check_preempt_curr = check_preempt_wakeup, - - .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_fair, - - .rq_online = rq_online_fair, - .rq_offline = rq_offline_fair, - - .task_waking = task_waking_fair, -#endif - - .set_curr_task = set_curr_task_fair, - .task_tick = task_tick_fair, - .task_fork = task_fork_fair, - - .prio_changed = prio_changed_fair, - .switched_from = 
switched_from_fair, - .switched_to = switched_to_fair, - - .get_rr_interval = get_rr_interval_fair, - -#ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, -#endif -}; - -#ifdef CONFIG_SCHED_DEBUG -void print_cfs_stats(struct seq_file *m, int cpu) -{ - struct cfs_rq *cfs_rq; - - rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) - print_cfs_rq(m, cpu, cfs_rq); - rcu_read_unlock(); -} -#endif - -__init void init_sched_fair_class(void) -{ -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); - -#ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - cpu_notifier(sched_ilb_notifier, 0); -#endif -#endif /* SMP */ - -} -#include "sched.h" - -/* - * idle-task scheduling class. - * - * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched_fair.c) - */ - -#ifdef CONFIG_SMP -static int -select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) -{ - return task_cpu(p); /* IDLE tasks as never migrated */ -} -#endif /* CONFIG_SMP */ -/* - * Idle tasks are unconditionally rescheduled: - */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) -{ - resched_task(rq->idle); -} - -static struct task_struct *pick_next_task_idle(struct rq *rq) -{ - schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); - return rq->idle; -} - -/* - * It is not legal to sleep in the idle task - print a warning - * message if some code attempts to do it: - */ -static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) -{ - raw_spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); - dump_stack(); - raw_spin_lock_irq(&rq->lock); -} - -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - -static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) -{ -} - -static void set_curr_task_idle(struct rq *rq) -{ -} - -static void switched_to_idle(struct rq *rq, 
struct task_struct *p) -{ - BUG(); -} - -static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) -{ - BUG(); -} - -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - -/* - * Simple, special scheduling class for the per-CPU idle tasks: - */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ - /* no enqueue/yield_task for idle tasks */ - - /* dequeue is not valid, we print a debug message there: */ - .dequeue_task = dequeue_task_idle, - - .check_preempt_curr = check_preempt_curr_idle, - - .pick_next_task = pick_next_task_idle, - .put_prev_task = put_prev_task_idle, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_idle, -#endif - - .set_curr_task = set_curr_task_idle, - .task_tick = task_tick_idle, - - .get_rr_interval = get_rr_interval_idle, - - .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, -}; -/* - * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR - * policies) - */ - -#include "sched.h" - -#include - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -struct rt_bandwidth def_rt_bandwidth; - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; -} - -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - raw_spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; -} - -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); - raw_spin_unlock(&rt_b->rt_runtime_lock); -} - -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} - -#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) - -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif - return container_of(rt_se, struct task_struct, rt); -} - -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ - return rt_rq->rq; -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ - return rt_se->rt_rq; -} - -void 
free_rt_sched_group(struct task_group *tg) -{ - int i; - - if (tg->rt_se) - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; - rt_rq->tg = tg; - - tg->rt_rq[cpu] = rt_rq; - tg->rt_se[cpu] = rt_se; - - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} - -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rt_rq = kzalloc_node(sizeof(struct rt_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_se) - goto err_free_rq; - - init_rt_rq(rt_rq, cpu_rq(i)); - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); - } - - return 1; - -err_free_rq: - kfree(rt_rq); -err: - return 0; -} - -#else /* CONFIG_RT_GROUP_SCHED */ - -#define rt_entity_is_task(rt_se) (1) - -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) -{ - return container_of(rt_se, struct task_struct, rt); -} - -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) -{ - return 
container_of(rt_rq, struct rq, rt); -} - -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) -{ - struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); - - return &rq->rt; -} - -void free_rt_sched_group(struct task_group *tg) { } - -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP - -static inline int rt_overloaded(struct rq *rq) -{ - return atomic_read(&rq->rd->rto_count); -} - -static inline void rt_set_overload(struct rq *rq) -{ - if (!rq->online) - return; - - cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); - /* - * Make sure the mask is visible before we set - * the overload count. That is checked to determine - * if we should look at the mask. It would be a shame - * if we looked at the mask, but the mask was not - * updated yet. - */ - wmb(); - atomic_inc(&rq->rd->rto_count); -} - -static inline void rt_clear_overload(struct rq *rq) -{ - if (!rq->online) - return; - - /* the order here really doesn't matter */ - atomic_dec(&rq->rd->rto_count); - cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); -} - -static void update_rt_migration(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } -} - -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - if (!rt_entity_is_task(rt_se)) - return; - - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory++; - - update_rt_migration(rt_rq); -} - -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - if (!rt_entity_is_task(rt_se)) - return; - - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) - 
rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); -} - -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - -static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ - plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); - plist_node_init(&p->pushable_tasks, p->prio); - plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); - - /* Update the highest prio pushable task */ - if (p->prio < rq->rt.highest_prio.next) - rq->rt.highest_prio.next = p->prio; -} - -static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ - plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); - - /* Update the new highest prio pushable task */ - if (has_pushable_tasks(rq)) { - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; -} - -#else - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -#endif /* CONFIG_SMP */ - -static inline int on_rt_rq(struct sched_rt_entity *rt_se) -{ - return !list_empty(&rt_se->run_list); -} - -#ifdef CONFIG_RT_GROUP_SCHED - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - if (!rt_rq->tg) - return RUNTIME_INF; - - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); -} - -typedef struct task_group *rt_rq_iter_t; - -static inline struct task_group *next_task_group(struct task_group *tg) -{ - do { - tg = list_entry_rcu(tg->list.next, - typeof(struct task_group), list); - } while (&tg->list != &task_groups && 
task_group_is_autogroup(tg)); - - if (&tg->list == &task_groups) - tg = NULL; - - return tg; -} - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) - -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_add_rcu(&rt_rq->leaf_rt_rq_list, - &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_del_rcu(&rt_rq->leaf_rt_rq_list); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = rt_se->parent) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return rt_se->my_q; -} - -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); - -static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - struct sched_rt_entity *rt_se; - - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); - - rt_se = rt_rq->tg->rt_se[cpu]; - - if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); - if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); - } -} - -static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ - struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); - - rt_se = rt_rq->tg->rt_se[cpu]; - - if (rt_se && on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - -static int rt_se_boosted(struct sched_rt_entity *rt_se) -{ - struct rt_rq *rt_rq = group_rt_rq(rt_se); - struct task_struct *p; - - if (rt_rq) - return !!rt_rq->rt_nr_boosted; - - p = rt_task_of(rt_se); - return p->prio != p->normal_prio; -} - -#ifdef CONFIG_SMP 
-static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_rq(smp_processor_id())->rd->span; -} -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &rt_rq->tg->rt_bandwidth; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -typedef struct rt_rq *rt_rq_iter_t; - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return &cpu_rq(cpu)->rt; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; -} - -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP -/* 
- * We ran out of runtime, see if we can borrow some from our neighbours. - */ -static int do_balance_runtime(struct rt_rq *rt_rq) -{ - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - int i, weight, more = 0; - u64 rt_period; - - weight = cpumask_weight(rd->span); - - raw_spin_lock(&rt_b->rt_runtime_lock); - rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu(i, rd->span) { - struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); - s64 diff; - - if (iter == rt_rq) - continue; - - raw_spin_lock(&iter->rt_runtime_lock); - /* - * Either all rqs have inf runtime and there's nothing to steal - * or __disable_runtime() below sets a specific rq to inf to - * indicate its been disabled and disalow stealing. - */ - if (iter->rt_runtime == RUNTIME_INF) - goto next; - - /* - * From runqueues with spare time, take 1/n part of their - * spare time, but no more than our period. - */ - diff = iter->rt_runtime - iter->rt_time; - if (diff > 0) { - diff = div_u64((u64)diff, weight); - if (rt_rq->rt_runtime + diff > rt_period) - diff = rt_period - rt_rq->rt_runtime; - iter->rt_runtime -= diff; - rt_rq->rt_runtime += diff; - more = 1; - if (rt_rq->rt_runtime == rt_period) { - raw_spin_unlock(&iter->rt_runtime_lock); - break; - } - } -next: - raw_spin_unlock(&iter->rt_runtime_lock); - } - raw_spin_unlock(&rt_b->rt_runtime_lock); - - return more; -} - -/* - * Ensure this RQ takes back all the runtime it lend to its neighbours. 
- */ -static void __disable_runtime(struct rq *rq) -{ - struct root_domain *rd = rq->rd; - rt_rq_iter_t iter; - struct rt_rq *rt_rq; - - if (unlikely(!scheduler_running)) - return; - - for_each_rt_rq(rt_rq, iter, rq) { - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - s64 want; - int i; - - raw_spin_lock(&rt_b->rt_runtime_lock); - raw_spin_lock(&rt_rq->rt_runtime_lock); - /* - * Either we're all inf and nobody needs to borrow, or we're - * already disabled and thus have nothing to do, or we have - * exactly the right amount of runtime to take out. - */ - if (rt_rq->rt_runtime == RUNTIME_INF || - rt_rq->rt_runtime == rt_b->rt_runtime) - goto balanced; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - - /* - * Calculate the difference between what we started out with - * and what we current have, that's the amount of runtime - * we lend and now have to reclaim. - */ - want = rt_b->rt_runtime - rt_rq->rt_runtime; - - /* - * Greedy reclaim, take back as much as we can. - */ - for_each_cpu(i, rd->span) { - struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); - s64 diff; - - /* - * Can't reclaim from ourselves or disabled runqueues. - */ - if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) - continue; - - raw_spin_lock(&iter->rt_runtime_lock); - if (want > 0) { - diff = min_t(s64, iter->rt_runtime, want); - iter->rt_runtime -= diff; - want -= diff; - } else { - iter->rt_runtime -= want; - want -= want; - } - raw_spin_unlock(&iter->rt_runtime_lock); - - if (!want) - break; - } - - raw_spin_lock(&rt_rq->rt_runtime_lock); - /* - * We cannot be left wanting - that would mean some runtime - * leaked out of the system. - */ - BUG_ON(want); -balanced: - /* - * Disable all the borrow logic by pretending we have inf - * runtime - in which case borrowing doesn't make sense. 
- */ - rt_rq->rt_runtime = RUNTIME_INF; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - raw_spin_unlock(&rt_b->rt_runtime_lock); - } -} - -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -static void __enable_runtime(struct rq *rq) -{ - rt_rq_iter_t iter; - struct rt_rq *rt_rq; - - if (unlikely(!scheduler_running)) - return; - - /* - * Reset each runqueue's bandwidth settings - */ - for_each_rt_rq(rt_rq, iter, rq) { - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - - raw_spin_lock(&rt_b->rt_runtime_lock); - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_b->rt_runtime; - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - raw_spin_unlock(&rt_b->rt_runtime_lock); - } -} - -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int balance_runtime(struct rt_rq *rt_rq) -{ - int more = 0; - - if (!sched_feat(RT_RUNTIME_SHARE)) - return more; - - if (rt_rq->rt_time > rt_rq->rt_runtime) { - raw_spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); - raw_spin_lock(&rt_rq->rt_runtime_lock); - } - - return more; -} -#else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} -#endif /* CONFIG_SMP */ - -static int 
do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) -{ - int i, idle = 1; - const struct cpumask *span; - - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - - span = sched_rt_period_mask(); - for_each_cpu(i, span) { - int enqueue = 0; - struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); - struct rq *rq = rq_of_rt_rq(rt_rq); - - raw_spin_lock(&rq->lock); - if (rt_rq->rt_time) { - u64 runtime; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - if (rt_rq->rt_throttled) - balance_runtime(rt_rq); - runtime = rt_rq->rt_runtime; - rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - enqueue = 1; - - /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. - */ - if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; - } - if (rt_rq->rt_time || rt_rq->rt_nr_running) - idle = 0; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } else if (rt_rq->rt_nr_running) { - idle = 0; - if (!rt_rq_throttled(rt_rq)) - enqueue = 1; - } - - if (enqueue) - sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); - } - - return idle; -} - -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio.curr; -#endif - - return rt_task_of(rt_se)->prio; -} - -static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) -{ - u64 runtime = sched_rt_runtime(rt_rq); - - if (rt_rq->rt_throttled) - return rt_rq_throttled(rt_rq); - - if (runtime >= sched_rt_period(rt_rq)) - return 0; - - balance_runtime(rt_rq); - runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - - if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); - if (rt_rq_throttled(rt_rq)) { - sched_rt_rq_dequeue(rt_rq); - return 1; - } - } - - 
return 0; -} - -/* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. - */ -static void update_curr_rt(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - u64 delta_exec; - - if (curr->sched_class != &rt_sched_class) - return; - - delta_exec = rq->clock_task - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = rq->clock_task; - cpuacct_charge(curr, delta_exec); - - sched_rt_avg_update(rq, delta_exec); - - if (!rt_bandwidth_enabled()) - return; - - for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); - - if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - } -} - -#if defined CONFIG_SMP - -static void -inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) -{ - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (rq->online && prio < prev_prio) - cpupri_set(&rq->rd->cpupri, rq->cpu, prio); -} - -static void -dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) -{ - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (rq->online && rt_rq->highest_prio.curr != prev_prio) - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); -} - -#else /* CONFIG_SMP */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED -static void -inc_rt_prio(struct rt_rq *rt_rq, int prio) -{ - int prev_prio = 
rt_rq->highest_prio.curr; - - if (prio < prev_prio) - rt_rq->highest_prio.curr = prio; - - inc_rt_prio_smp(rt_rq, prio, prev_prio); -} - -static void -dec_rt_prio(struct rt_rq *rt_rq, int prio) -{ - int prev_prio = rt_rq->highest_prio.curr; - - if (rt_rq->rt_nr_running) { - - WARN_ON(prio < prev_prio); - - /* - * This may have been our highest task, and therefore - * we may have some recomputation to do - */ - if (prio == prev_prio) { - struct rt_prio_array *array = &rt_rq->active; - - rt_rq->highest_prio.curr = - sched_find_first_bit(array->bitmap); - } - - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; - - dec_rt_prio_smp(rt_rq, prio, prev_prio); -} - -#else - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED - -static void -inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - if (rt_se_boosted(rt_se)) - rt_rq->rt_nr_boosted++; - - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); -} - -static void -dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - if (rt_se_boosted(rt_se)) - rt_rq->rt_nr_boosted--; - - WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); -} - -#else /* CONFIG_RT_GROUP_SCHED */ - -static void -inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - start_rt_bandwidth(&def_rt_bandwidth); -} - -static inline -void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} - -#endif /* CONFIG_RT_GROUP_SCHED */ - -static inline -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - int prio = rt_se_prio(rt_se); - - WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; - - inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); - inc_rt_group(rt_se, rt_rq); -} - -static inline -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - WARN_ON(!rt_prio(rt_se_prio(rt_se))); - 
WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; - - dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); - dec_rt_group(rt_se, rt_rq); -} - -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) -{ - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - struct rt_prio_array *array = &rt_rq->active; - struct rt_rq *group_rq = group_rt_rq(rt_se); - struct list_head *queue = array->queue + rt_se_prio(rt_se); - - /* - * Don't enqueue the group if its throttled, or when empty. - * The latter is a consequence of the former when a child group - * get throttled and the current group doesn't have any other - * active members. - */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) - return; - - if (!rt_rq->rt_nr_running) - list_add_leaf_rt_rq(rt_rq); - - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); - - inc_rt_tasks(rt_se, rt_rq); -} - -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) -{ - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - struct rt_prio_array *array = &rt_rq->active; - - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); - - dec_rt_tasks(rt_se, rt_rq); - if (!rt_rq->rt_nr_running) - list_del_leaf_rt_rq(rt_rq); -} - -/* - * Because the prio of an upper entry depends on the lower - * entries, we must remove entries top - down. 
- */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) -{ - struct sched_rt_entity *back = NULL; - - for_each_sched_rt_entity(rt_se) { - rt_se->back = back; - back = rt_se; - } - - for (rt_se = back; rt_se; rt_se = rt_se->back) { - if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); - } -} - -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) -{ - dequeue_rt_stack(rt_se); - for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); -} - -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) -{ - dequeue_rt_stack(rt_se); - - for_each_sched_rt_entity(rt_se) { - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); - } -} - -/* - * Adding/removing a task to/from a priority array: - */ -static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) -{ - struct sched_rt_entity *rt_se = &p->rt; - - if (flags & ENQUEUE_WAKEUP) - rt_se->timeout = 0; - - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) - enqueue_pushable_task(rq, p); - - inc_nr_running(rq); -} - -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) -{ - struct sched_rt_entity *rt_se = &p->rt; - - update_curr_rt(rq); - dequeue_rt_entity(rt_se); - - dequeue_pushable_task(rq, p); - - dec_nr_running(rq); -} - -/* - * Put task to the head or the end of the run list without the overhead of - * dequeue followed by enqueue. 
- */ -static void -requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) -{ - if (on_rt_rq(rt_se)) { - struct rt_prio_array *array = &rt_rq->active; - struct list_head *queue = array->queue + rt_se_prio(rt_se); - - if (head) - list_move(&rt_se->run_list, queue); - else - list_move_tail(&rt_se->run_list, queue); - } -} - -static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) -{ - struct sched_rt_entity *rt_se = &p->rt; - struct rt_rq *rt_rq; - - for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); - requeue_rt_entity(rt_rq, rt_se, head); - } -} - -static void yield_task_rt(struct rq *rq) -{ - requeue_task_rt(rq, rq->curr, 0); -} - -#ifdef CONFIG_SMP -static int find_lowest_rq(struct task_struct *task); - -static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) -{ - struct task_struct *curr; - struct rq *rq; - int cpu; - - cpu = task_cpu(p); - - if (p->rt.nr_cpus_allowed == 1) - goto out; - - /* For anything but wake ups, just return the task_cpu */ - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) - goto out; - - rq = cpu_rq(cpu); - - rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ - - /* - * If the current task on @p's runqueue is an RT task, then - * try to see if we can wake this RT task up on another - * runqueue. Otherwise simply start this RT task - * on its current runqueue. - * - * We want to avoid overloading runqueues. If the woken - * task is a higher priority, then it will stay on this CPU - * and the lower prio task should be moved to another CPU. - * Even though this will probably make the lower prio task - * lose its cache, we do not want to bounce a higher task - * around just because it gave up its CPU, perhaps for a - * lock? - * - * For equal prio tasks, we just let the scheduler sort it out. 
- * - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - * - * This test is optimistic, if we get it wrong the load-balancer - * will have to sort it out. - */ - if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || - curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { - int target = find_lowest_rq(p); - - if (target != -1) - cpu = target; - } - rcu_read_unlock(); - -out: - return cpu; -} - -static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) -{ - if (rq->curr->rt.nr_cpus_allowed == 1) - return; - - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) - return; - - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - - /* - * There appears to be other cpus that can accept - * current and none to run 'p', so lets reschedule - * to try and push current away: - */ - requeue_task_rt(rq, p, 1); - resched_task(rq->curr); -} - -#endif /* CONFIG_SMP */ - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) -{ - if (p->prio < rq->curr->prio) { - resched_task(rq->curr); - return; - } - -#ifdef CONFIG_SMP - /* - * If: - * - * - the newly woken task is of equal priority to the current task - * - the newly woken task is non-migratable while current is migratable - * - current will be preempted on the next reschedule - * - * we should check to see if current can readily move to a different - * cpu. If so, we will reschedule to allow the push logic to try - * to move current somewhere else, making room for our non-migratable - * task. 
- */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) - check_preempt_equal_prio(rq, p); -#endif -} - -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) -{ - struct rt_prio_array *array = &rt_rq->active; - struct sched_rt_entity *next = NULL; - struct list_head *queue; - int idx; - - idx = sched_find_first_bit(array->bitmap); - BUG_ON(idx >= MAX_RT_PRIO); - - queue = array->queue + idx; - next = list_entry(queue->next, struct sched_rt_entity, run_list); - - return next; -} - -static struct task_struct *_pick_next_task_rt(struct rq *rq) -{ - struct sched_rt_entity *rt_se; - struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; - - do { - rt_se = pick_next_rt_entity(rq, rt_rq); - BUG_ON(!rt_se); - rt_rq = group_rt_rq(rt_se); - } while (rt_rq); - - p = rt_task_of(rt_se); - p->se.exec_start = rq->clock_task; - - return p; -} - -static struct task_struct *pick_next_task_rt(struct rq *rq) -{ - struct task_struct *p = _pick_next_task_rt(rq); - - /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); - -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif - - return p; -} - -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) -{ - update_curr_rt(rq); - - /* - * The previous task needs to be made eligible for pushing - * if it is still active - */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) - enqueue_pushable_task(rq, p); -} - -#ifdef CONFIG_SMP - -/* Only try algorithms three times */ -#define RT_MAX_TRIES 3 - -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) - 
return 1; - return 0; -} - -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) -{ - struct task_struct *next = NULL; - struct sched_rt_entity *rt_se; - struct rt_prio_array *array; - struct rt_rq *rt_rq; - int idx; - - for_each_leaf_rt_rq(rt_rq, rq) { - array = &rt_rq->active; - idx = sched_find_first_bit(array->bitmap); -next_idx: - if (idx >= MAX_RT_PRIO) - continue; - if (next && next->prio < idx) - continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - continue; - - p = rt_task_of(rt_se); - if (pick_rt_task(rq, p, cpu)) { - next = p; - break; - } - } - if (!next) { - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - goto next_idx; - } - } - - return next; -} - -static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); - -static int find_lowest_rq(struct task_struct *task) -{ - struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); - int this_cpu = smp_processor_id(); - int cpu = task_cpu(task); - - /* Make sure the mask is initialized first */ - if (unlikely(!lowest_mask)) - return -1; - - if (task->rt.nr_cpus_allowed == 1) - return -1; /* No other targets possible */ - - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) - return -1; /* No targets found */ - - /* - * At this point we have built a mask of cpus representing the - * lowest priority tasks in the system. Now we want to elect - * the best one based on our affinity and topology. - * - * We prioritize the last cpu that the task executed on since - * it is most likely cache-hot in that location. - */ - if (cpumask_test_cpu(cpu, lowest_mask)) - return cpu; - - /* - * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data. 
- */ - if (!cpumask_test_cpu(this_cpu, lowest_mask)) - this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - - rcu_read_lock(); - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; - - /* - * "this_cpu" is cheaper to preempt than a - * remote processor. - */ - if (this_cpu != -1 && - cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { - rcu_read_unlock(); - return this_cpu; - } - - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); - if (best_cpu < nr_cpu_ids) { - rcu_read_unlock(); - return best_cpu; - } - } - } - rcu_read_unlock(); - - /* - * And finally, if there were no matches within the domains - * just give the caller *something* to work with from the compatible - * locations. - */ - if (this_cpu != -1) - return this_cpu; - - cpu = cpumask_any(lowest_mask); - if (cpu < nr_cpu_ids) - return cpu; - return -1; -} - -/* Will lock the rq it finds */ -static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) -{ - struct rq *lowest_rq = NULL; - int tries; - int cpu; - - for (tries = 0; tries < RT_MAX_TRIES; tries++) { - cpu = find_lowest_rq(task); - - if ((cpu == -1) || (cpu == rq->cpu)) - break; - - lowest_rq = cpu_rq(cpu); - - /* if the prio of this runqueue changed, try again */ - if (double_lock_balance(rq, lowest_rq)) { - /* - * We had to unlock the run queue. In - * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. - */ - if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, - tsk_cpus_allowed(task)) || - task_running(rq, task) || - !task->on_rq)) { - - raw_spin_unlock(&lowest_rq->lock); - lowest_rq = NULL; - break; - } - } - - /* If this rq is still suitable use it. 
*/ - if (lowest_rq->rt.highest_prio.curr > task->prio) - break; - - /* try again */ - double_unlock_balance(rq, lowest_rq); - lowest_rq = NULL; - } - - return lowest_rq; -} - -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); - - BUG_ON(!p->on_rq); - BUG_ON(!rt_task(p)); - - return p; -} - -/* - * If the current CPU has more than one RT task, see if the non - * running task can migrate over to a CPU that is running a task - * of lesser priority. - */ -static int push_rt_task(struct rq *rq) -{ - struct task_struct *next_task; - struct rq *lowest_rq; - int ret = 0; - - if (!rq->rt.overloaded) - return 0; - - next_task = pick_next_pushable_task(rq); - if (!next_task) - return 0; - -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - if (unlikely(task_running(rq, next_task))) - return 0; -#endif - -retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); - return 0; - } - - /* - * It's possible that the next_task slipped in of - * higher priority than current. If that's the case - * just reschedule current. - */ - if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); - return 0; - } - - /* We might release rq lock */ - get_task_struct(next_task); - - /* find_lock_lowest_rq locks the rq if found */ - lowest_rq = find_lock_lowest_rq(next_task, rq); - if (!lowest_rq) { - struct task_struct *task; - /* - * find_lock_lowest_rq releases rq->lock - * so it is possible that next_task has migrated. - * - * We need to make sure that the task is still on the same - * run-queue and is also still the next task eligible for - * pushing. 
- */ - task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { - /* - * The task hasn't migrated, and is still the next - * eligible task, but we failed to find a run-queue - * to push it to. Do not retry in this case, since - * other cpus will pull from us when ready. - */ - goto out; - } - - if (!task) - /* No more tasks, just exit */ - goto out; - - /* - * Something has shifted, try again. - */ - put_task_struct(next_task); - next_task = task; - goto retry; - } - - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); - ret = 1; - - resched_task(lowest_rq->curr); - - double_unlock_balance(rq, lowest_rq); - -out: - put_task_struct(next_task); - - return ret; -} - -static void push_rt_tasks(struct rq *rq) -{ - /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) - ; -} - -static int pull_rt_task(struct rq *this_rq) -{ - int this_cpu = this_rq->cpu, ret = 0, cpu; - struct task_struct *p; - struct rq *src_rq; - - if (likely(!rt_overloaded(this_rq))) - return 0; - - for_each_cpu(cpu, this_rq->rd->rto_mask) { - if (this_cpu == cpu) - continue; - - src_rq = cpu_rq(cpu); - - /* - * Don't bother taking the src_rq->lock if the next highest - * task is known to be lower-priority than our current task. - * This may look racy, but if this value is about to go - * logically higher, the src_rq will push this task away. - * And if its going logically lower, we do not care - */ - if (src_rq->rt.highest_prio.next >= - this_rq->rt.highest_prio.curr) - continue; - - /* - * We can potentially drop this_rq's lock in - * double_lock_balance, and another CPU could - * alter this_rq - */ - double_lock_balance(this_rq, src_rq); - - /* - * Are there still pullable RT tasks? - */ - if (src_rq->rt.rt_nr_running <= 1) - goto skip; - - p = pick_next_highest_task_rt(src_rq, this_cpu); - - /* - * Do we have an RT task that preempts - * the to-be-scheduled task? 
- */ - if (p && (p->prio < this_rq->rt.highest_prio.curr)) { - WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); - - /* - * There's a chance that p is higher in priority - * than what's currently running on its cpu. - * This is just that p is wakeing up and hasn't - * had a chance to schedule. We only pull - * p if it is lower in priority than the - * current task on the run queue - */ - if (p->prio < src_rq->curr->prio) - goto skip; - - ret = 1; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - /* - * We continue with the search, just in - * case there's an even higher prio task - * in another runqueue. (low likelihood - * but possible) - */ - } -skip: - double_unlock_balance(this_rq, src_rq); - } - - return ret; -} - -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - -static void post_schedule_rt(struct rq *rq) -{ - push_rt_tasks(rq); -} - -/* - * If we are not running and we are not going to reschedule soon, we should - * try to push tasks away now - */ -static void task_woken_rt(struct rq *rq, struct task_struct *p) -{ - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && - rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) - push_rt_tasks(rq); -} - -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - int weight = cpumask_weight(new_mask); - - BUG_ON(!rt_task(p)); - - /* - * Update the migration status of the RQ if we have an RT task - * which is running AND changing its weight value. - */ - if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { - struct rq *rq = task_rq(p); - - if (!task_current(rq, p)) { - /* - * Make sure we dequeue this task from the pushable list - * before going further. 
It will either remain off of - * the list because we are no longer pushable, or it - * will be requeued. - */ - if (p->rt.nr_cpus_allowed > 1) - dequeue_pushable_task(rq, p); - - /* - * Requeue if our weight is changing and still > 1 - */ - if (weight > 1) - enqueue_pushable_task(rq, p); - - } - - if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { - rq->rt.rt_nr_migratory++; - } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } - - update_rt_migration(&rq->rt); - } -} - -/* Assumes rq->lock is held */ -static void rq_online_rt(struct rq *rq) -{ - if (rq->rt.overloaded) - rt_set_overload(rq); - - __enable_runtime(rq); - - cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); -} - -/* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) -{ - if (rq->rt.overloaded) - rt_clear_overload(rq); - - __disable_runtime(rq); - - cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); -} - -/* - * When switch from the rt queue, we bring ourselves to a position - * that we might want to pull RT tasks from other runqueues. - */ -static void switched_from_rt(struct rq *rq, struct task_struct *p) -{ - /* - * If there are other RT tasks then we will reschedule - * and the scheduling of the other RT tasks will handle - * the balancing. But if we are the last RT task - * we may need to handle the pulling of RT tasks - * now. - */ - if (p->on_rq && !rq->rt.rt_nr_running) - pull_rt_task(rq); -} - -void init_sched_rt_class(void) -{ - unsigned int i; - - for_each_possible_cpu(i) { - zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), - GFP_KERNEL, cpu_to_node(i)); - } -} -#endif /* CONFIG_SMP */ - -/* - * When switching a task to RT, we may overload the runqueue - * with RT tasks. In this case we try to push them off to - * other runqueues. 
- */ -static void switched_to_rt(struct rq *rq, struct task_struct *p) -{ - int check_resched = 1; - - /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task - * then see if we can move to another run queue. - */ - if (p->on_rq && rq->curr != p) { -#ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && - /* Don't resched if we changed runqueues */ - rq != task_rq(p)) - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); - } -} - -/* - * Priority of the task has changed. This may cause - * us to initiate a push or pull. - */ -static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) -{ - if (!p->on_rq) - return; - - if (rq->curr == p) { -#ifdef CONFIG_SMP - /* - * If our priority decreases while running, we - * may need to pull tasks to this runqueue. - */ - if (oldprio < p->prio) - pull_rt_task(rq); - /* - * If there's a higher priority task waiting to run - * then reschedule. Note, the above pull_rt_task - * can release the rq lock and p could migrate. - * Only reschedule if p is still on the same runqueue. - */ - if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_task(p); -#endif /* CONFIG_SMP */ - } else { - /* - * This task is not running, but if it is - * greater than the current running task - * then reschedule. 
- */ - if (p->prio < rq->curr->prio) - resched_task(rq->curr); - } -} - -static void watchdog(struct rq *rq, struct task_struct *p) -{ - unsigned long soft, hard; - - /* max may change after cur was read, this will be fixed next tick */ - soft = task_rlimit(p, RLIMIT_RTTIME); - hard = task_rlimit_max(p, RLIMIT_RTTIME); - - if (soft != RLIM_INFINITY) { - unsigned long next; - - p->rt.timeout++; - next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (p->rt.timeout > next) - p->cputime_expires.sched_exp = p->se.sum_exec_runtime; - } -} - -static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) -{ - update_curr_rt(rq); - - watchdog(rq, p); - - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if (p->policy != SCHED_RR) - return; - - if (--p->rt.time_slice) - return; - - p->rt.time_slice = DEF_TIMESLICE; - - /* - * Requeue to the end of queue if we are not the only element - * on the queue: - */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); - } -} - -static void set_curr_task_rt(struct rq *rq) -{ - struct task_struct *p = rq->curr; - - p->se.exec_start = rq->clock_task; - - /* The running task is never eligible for pushing */ - dequeue_pushable_task(rq, p); -} - -static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) -{ - /* - * Time slice is 0 for SCHED_FIFO tasks - */ - if (task->policy == SCHED_RR) - return DEF_TIMESLICE; - else - return 0; -} - -const struct sched_class rt_sched_class = { - .next = &fair_sched_class, - .enqueue_task = enqueue_task_rt, - .dequeue_task = dequeue_task_rt, - .yield_task = yield_task_rt, - - .check_preempt_curr = check_preempt_curr_rt, - - .pick_next_task = pick_next_task_rt, - .put_prev_task = put_prev_task_rt, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_rt, - - .set_cpus_allowed = set_cpus_allowed_rt, - .rq_online = rq_online_rt, - .rq_offline = rq_offline_rt, 
- .pre_schedule = pre_schedule_rt, - .post_schedule = post_schedule_rt, - .task_woken = task_woken_rt, - .switched_from = switched_from_rt, -#endif - - .set_curr_task = set_curr_task_rt, - .task_tick = task_tick_rt, - - .get_rr_interval = get_rr_interval_rt, - - .prio_changed = prio_changed_rt, - .switched_to = switched_to_rt, -}; - -#ifdef CONFIG_SCHED_DEBUG -extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); - -void print_rt_stats(struct seq_file *m, int cpu) -{ - rt_rq_iter_t iter; - struct rt_rq *rt_rq; - - rcu_read_lock(); - for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) - print_rt_rq(m, cpu, rt_rq); - rcu_read_unlock(); -} -#endif /* CONFIG_SCHED_DEBUG */ - -#include -#include -#include -#include - -#include "sched.h" - -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 15 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcount = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", - cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, - rq->ttwu_count, rq->ttwu_local, - rq->rq_cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { - enum cpu_idle_type itype; - - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - 
seq_printf(seq, " %u %u %u %u %u %u %u %u", - sd->lb_count[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, - " %u %u %u %u %u %u %u %u %u %u %u %u\n", - sd->alb_count, sd->alb_failed, sd->alb_pushed, - sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - rcu_read_unlock(); -#endif - } - kfree(mask_str); - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_schedstat_init(void) -{ - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - return 0; -} -module_init(proc_schedstat_init); -#include "sched.h" - -/* - * stop-task scheduling class. - * - * The stop task is the highest priority task in the system, it preempts - * everything and will be preempted by nothing. 
- * - * See kernel/stop_machine.c - */ - -#ifdef CONFIG_SMP -static int -select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) -{ - return task_cpu(p); /* stop tasks as never migrate */ -} -#endif /* CONFIG_SMP */ - -static void -check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) -{ - /* we're never preempted */ -} - -static struct task_struct *pick_next_task_stop(struct rq *rq) -{ - struct task_struct *stop = rq->stop; - - if (stop && stop->on_rq) - return stop; - - return NULL; -} - -static void -enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) -{ - inc_nr_running(rq); -} - -static void -dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) -{ - dec_nr_running(rq); -} - -static void yield_task_stop(struct rq *rq) -{ - BUG(); /* the stop task should never yield, its pointless. */ -} - -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) -{ -} - -static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) -{ -} - -static void set_curr_task_stop(struct rq *rq) -{ -} - -static void switched_to_stop(struct rq *rq, struct task_struct *p) -{ - BUG(); /* its impossible to change to this class */ -} - -static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) -{ - BUG(); /* how!?, what priority? 
*/ -} - -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - -/* - * Simple, special scheduling class for the per-CPU stop tasks: - */ -const struct sched_class stop_sched_class = { - .next = &rt_sched_class, - - .enqueue_task = enqueue_task_stop, - .dequeue_task = dequeue_task_stop, - .yield_task = yield_task_stop, - - .check_preempt_curr = check_preempt_curr_stop, - - .pick_next_task = pick_next_task_stop, - .put_prev_task = put_prev_task_stop, - -#ifdef CONFIG_SMP - .select_task_rq = select_task_rq_stop, -#endif - - .set_curr_task = set_curr_task_stop, - .task_tick = task_tick_stop, - - .get_rr_interval = get_rr_interval_stop, - - .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, -}; -/* - * linux/kernel/seccomp.c - * - * Copyright 2004-2005 Andrea Arcangeli - * - * This defines a simple but solid secure-computing mode. - */ - -#include -#include -#include -#include - -/* #define SECCOMP_DEBUG 1 */ -#define NR_SECCOMP_MODES 1 - -/* - * Secure computing mode 1 allows only read/write/exit/sigreturn. - * To be fully secure this must be combined with rlimit - * to limit the stack allocations too. 
- */ -static int mode1_syscalls[] = { - __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ -}; - -#ifdef CONFIG_COMPAT -static int mode1_syscalls_32[] = { - __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ -}; -#endif - -void __secure_computing(int this_syscall) -{ - int mode = current->seccomp.mode; - int * syscall; - - switch (mode) { - case 1: - syscall = mode1_syscalls; -#ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall = mode1_syscalls_32; -#endif - do { - if (*syscall == this_syscall) - return; - } while (*++syscall); - break; - default: - BUG(); - } - -#ifdef SECCOMP_DEBUG - dump_stack(); -#endif - audit_seccomp(this_syscall); - do_exit(SIGKILL); -} - -long prctl_get_seccomp(void) -{ - return current->seccomp.mode; -} - -long prctl_set_seccomp(unsigned long seccomp_mode) -{ - long ret; - - /* can set it only once to be even more secure */ - ret = -EPERM; - if (unlikely(current->seccomp.mode)) - goto out; - - ret = -EINVAL; - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); -#ifdef TIF_NOTSC - disable_TSC(); -#endif - ret = 0; - } - - out: - return ret; -} -/* - * Copyright (c) 2008 Intel Corporation - * Author: Matthew Wilcox - * - * Distributed under the terms of the GNU GPL, version 2 - * - * This file implements counting semaphores. - * A counting semaphore may be acquired 'n' times before sleeping. - * See mutex.c for single-acquisition sleeping locks which enforce - * rules which allow code to be debugged more easily. - */ - -/* - * Some notes on the implementation: - * - * The spinlock controls access to the other members of the semaphore. - * down_trylock() and up() can be called from interrupt context, so we - * have to disable interrupts when taking the lock. 
It turns out various - * parts of the kernel expect to be able to use down() on a semaphore in - * interrupt context when they know it will succeed, so we have to use - * irqsave variants for down(), down_interruptible() and down_killable() - * too. - * - * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. - */ - -#include -#include -#include -#include -#include -#include -#include - -static noinline void __down(struct semaphore *sem); -static noinline int __down_interruptible(struct semaphore *sem); -static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); -static noinline void __up(struct semaphore *sem); - -/** - * down - acquire the semaphore - * @sem: the semaphore to be acquired - * - * Acquires the semaphore. If no more tasks are allowed to acquire the - * semaphore, calling this function will put the task to sleep until the - * semaphore is released. - * - * Use of this function is deprecated, please use down_interruptible() or - * down_killable() instead. - */ -void down(struct semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - __down(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); -} -EXPORT_SYMBOL(down); - -/** - * down_interruptible - acquire the semaphore unless interrupted - * @sem: the semaphore to be acquired - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the sleep is interrupted by a signal, this function will return -EINTR. - * If the semaphore is successfully acquired, this function returns 0. 
- */ -int down_interruptible(struct semaphore *sem) -{ - unsigned long flags; - int result = 0; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_interruptible(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_interruptible); - -/** - * down_killable - acquire the semaphore unless killed - * @sem: the semaphore to be acquired - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the sleep is interrupted by a fatal signal, this function will return - * -EINTR. If the semaphore is successfully acquired, this function returns - * 0. - */ -int down_killable(struct semaphore *sem) -{ - unsigned long flags; - int result = 0; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_killable(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_killable); - -/** - * down_trylock - try to acquire the semaphore, without waiting - * @sem: the semaphore to be acquired - * - * Try to acquire the semaphore atomically. Returns 0 if the mutex has - * been acquired successfully or 1 if it it cannot be acquired. - * - * NOTE: This return value is inverted from both spin_trylock and - * mutex_trylock! Be careful about this when converting code. - * - * Unlike mutex_trylock, this function can be used from interrupt context, - * and the semaphore can be released by any task or interrupt. 
- */ -int down_trylock(struct semaphore *sem) -{ - unsigned long flags; - int count; - - raw_spin_lock_irqsave(&sem->lock, flags); - count = sem->count - 1; - if (likely(count >= 0)) - sem->count = count; - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return (count < 0); -} -EXPORT_SYMBOL(down_trylock); - -/** - * down_timeout - acquire the semaphore within a specified time - * @sem: the semaphore to be acquired - * @jiffies: how long to wait before failing - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the semaphore is not released within the specified number of jiffies, - * this function returns -ETIME. It returns 0 if the semaphore was acquired. - */ -int down_timeout(struct semaphore *sem, long jiffies) -{ - unsigned long flags; - int result = 0; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_timeout(sem, jiffies); - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_timeout); - -/** - * up - release the semaphore - * @sem: the semaphore to release - * - * Release the semaphore. Unlike mutexes, up() may be called from any - * context and even by tasks which have never called down(). - */ -void up(struct semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(list_empty(&sem->wait_list))) - sem->count++; - else - __up(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); -} -EXPORT_SYMBOL(up); - -/* Functions for the contended case */ - -struct semaphore_waiter { - struct list_head list; - struct task_struct *task; - int up; -}; - -/* - * Because this function is inlined, the 'state' parameter will be - * constant, and thus optimised away by the compiler. Likewise the - * 'timeout' parameter for the cases without timeouts. 
- */ -static inline int __sched __down_common(struct semaphore *sem, long state, - long timeout) -{ - struct task_struct *task = current; - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); - waiter.task = task; - waiter.up = 0; - - for (;;) { - if (signal_pending_state(state, task)) - goto interrupted; - if (timeout <= 0) - goto timed_out; - __set_task_state(task, state); - raw_spin_unlock_irq(&sem->lock); - timeout = schedule_timeout(timeout); - raw_spin_lock_irq(&sem->lock); - if (waiter.up) - return 0; - } - - timed_out: - list_del(&waiter.list); - return -ETIME; - - interrupted: - list_del(&waiter.list); - return -EINTR; -} - -static noinline void __sched __down(struct semaphore *sem) -{ - __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_interruptible(struct semaphore *sem) -{ - return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_killable(struct semaphore *sem) -{ - return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) -{ - return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); -} - -static noinline void __sched __up(struct semaphore *sem) -{ - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); - waiter->up = 1; - wake_up_process(waiter->task); -} -/* - * linux/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson - * - * 2003-06-02 Jim Houston - Concurrent Computer Corp. - * Changes to use preallocated sigqueue structures - * to allow signals to be sent reliably. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define CREATE_TRACE_POINTS -#include - -#include -#include -#include -#include -#include "audit.h" /* audit_signal_info() */ - -/* - * SLAB caches for signal bits. - */ - -static struct kmem_cache *sigqueue_cachep; - -int print_fatal_signals __read_mostly; - -static void __user *sig_handler(struct task_struct *t, int sig) -{ - return t->sighand->action[sig - 1].sa.sa_handler; -} - -static int sig_handler_ignored(void __user *handler, int sig) -{ - /* Is it explicitly or implicitly ignored? */ - return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); -} - -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) -{ - void __user *handler; - - handler = sig_handler(t, sig); - - if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) - return 1; - - return sig_handler_ignored(handler, sig); -} - -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) -{ - /* - * Blocked signals are never ignored, since the - * signal handler may change by the time it is - * unblocked. - */ - if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) - return 0; - - if (!sig_task_ignored(t, sig, from_ancestor_ns)) - return 0; - - /* - * Tracers may want to know about even ignored signals. - */ - return !t->ptrace; -} - -/* - * Re-calculate pending state from the set of locally pending - * signals, globally pending signals, and blocked signals. 
- */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) -{ - unsigned long ready; - long i; - - switch (_NSIG_WORDS) { - default: - for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) - ready |= signal->sig[i] &~ blocked->sig[i]; - break; - - case 4: ready = signal->sig[3] &~ blocked->sig[3]; - ready |= signal->sig[2] &~ blocked->sig[2]; - ready |= signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 2: ready = signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 1: ready = signal->sig[0] &~ blocked->sig[0]; - } - return ready != 0; -} - -#define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) - -static int recalc_sigpending_tsk(struct task_struct *t) -{ - if ((t->jobctl & JOBCTL_PENDING_MASK) || - PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { - set_tsk_thread_flag(t, TIF_SIGPENDING); - return 1; - } - /* - * We must never clear the flag in another thread, or in current - * when it's possible the current syscall is returning -ERESTART*. - * So we don't clear it here, and only callers who know they should do. - */ - return 0; -} - -/* - * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. - * This is superfluous when called on current, the wakeup is a harmless no-op. - */ -void recalc_sigpending_and_wake(struct task_struct *t) -{ - if (recalc_sigpending_tsk(t)) - signal_wake_up(t, 0); -} - -void recalc_sigpending(void) -{ - if (!recalc_sigpending_tsk(current) && !freezing(current)) - clear_thread_flag(TIF_SIGPENDING); - -} - -/* Given the mask, find the first available signal that should be serviced. 
*/ - -#define SYNCHRONOUS_MASK \ - (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ - sigmask(SIGTRAP) | sigmask(SIGFPE)) - -int next_signal(struct sigpending *pending, sigset_t *mask) -{ - unsigned long i, *s, *m, x; - int sig = 0; - - s = pending->signal.sig; - m = mask->sig; - - /* - * Handle the first word specially: it contains the - * synchronous signals that need to be dequeued first. - */ - x = *s &~ *m; - if (x) { - if (x & SYNCHRONOUS_MASK) - x &= SYNCHRONOUS_MASK; - sig = ffz(~x) + 1; - return sig; - } - - switch (_NSIG_WORDS) { - default: - for (i = 1; i < _NSIG_WORDS; ++i) { - x = *++s &~ *++m; - if (!x) - continue; - sig = ffz(~x) + i*_NSIG_BPW + 1; - break; - } - break; - - case 2: - x = s[1] &~ m[1]; - if (!x) - break; - sig = ffz(~x) + _NSIG_BPW + 1; - break; - - case 1: - /* Nothing to do */ - break; - } - - return sig; -} - -static inline void print_dropped_signal(int sig) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - - if (!print_fatal_signals) - return; - - if (!__ratelimit(&ratelimit_state)) - return; - - printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", - current->comm, current->pid, sig); -} - -/** - * task_set_jobctl_pending - set jobctl pending bits - * @task: target task - * @mask: pending bits to set - * - * Clear @mask from @task->jobctl. @mask must be subset of - * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | - * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is - * cleared. If @task is already being killed or exiting, this function - * becomes noop. - * - * CONTEXT: - * Must be called with @task->sighand->siglock held. - * - * RETURNS: - * %true if @mask is set, %false if made noop because @task was dying. 
- */ -bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) -{ - BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | - JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); - BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); - - if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) - return false; - - if (mask & JOBCTL_STOP_SIGMASK) - task->jobctl &= ~JOBCTL_STOP_SIGMASK; - - task->jobctl |= mask; - return true; -} - -/** - * task_clear_jobctl_trapping - clear jobctl trapping bit - * @task: target task - * - * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. - * Clear it and wake up the ptracer. Note that we don't need any further - * locking. @task->siglock guarantees that @task->parent points to the - * ptracer. - * - * CONTEXT: - * Must be called with @task->sighand->siglock held. - */ -void task_clear_jobctl_trapping(struct task_struct *task) -{ - if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { - task->jobctl &= ~JOBCTL_TRAPPING; - wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); - } -} - -/** - * task_clear_jobctl_pending - clear jobctl pending bits - * @task: target task - * @mask: pending bits to clear - * - * Clear @mask from @task->jobctl. @mask must be subset of - * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other - * STOP bits are cleared together. - * - * If clearing of @mask leaves no stop or trap pending, this function calls - * task_clear_jobctl_trapping(). - * - * CONTEXT: - * Must be called with @task->sighand->siglock held. 
- */ -void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) -{ - BUG_ON(mask & ~JOBCTL_PENDING_MASK); - - if (mask & JOBCTL_STOP_PENDING) - mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; - - task->jobctl &= ~mask; - - if (!(task->jobctl & JOBCTL_PENDING_MASK)) - task_clear_jobctl_trapping(task); -} - -/** - * task_participate_group_stop - participate in a group stop - * @task: task participating in a group stop - * - * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. - * Group stop states are cleared and the group stop count is consumed if - * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group - * stop, the appropriate %SIGNAL_* flags are set. - * - * CONTEXT: - * Must be called with @task->sighand->siglock held. - * - * RETURNS: - * %true if group stop completion should be notified to the parent, %false - * otherwise. - */ -static bool task_participate_group_stop(struct task_struct *task) -{ - struct signal_struct *sig = task->signal; - bool consume = task->jobctl & JOBCTL_STOP_CONSUME; - - WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); - - task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); - - if (!consume) - return false; - - if (!WARN_ON_ONCE(sig->group_stop_count == 0)) - sig->group_stop_count--; - - /* - * Tell the caller to notify completion iff we are entering into a - * fresh group stop. Read comment in do_signal_stop() for details. 
- */ - if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; - return true; - } - return false; -} - -/* - * allocate a new signal queue record - * - this may be called without locks if and only if t == current, otherwise an - * appropriate lock must be held to stop the target task from exiting - */ -static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) -{ - struct sigqueue *q = NULL; - struct user_struct *user; - - /* - * Protect access to @t credentials. This can go away when all - * callers hold rcu read lock. - */ - rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); - rcu_read_unlock(); - - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { - q = kmem_cache_alloc(sigqueue_cachep, flags); - } else { - print_dropped_signal(sig); - } - - if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); - } else { - INIT_LIST_HEAD(&q->list); - q->flags = 0; - q->user = user; - } - - return q; -} - -static void __sigqueue_free(struct sigqueue *q) -{ - if (q->flags & SIGQUEUE_PREALLOC) - return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); - kmem_cache_free(sigqueue_cachep, q); -} - -void flush_sigqueue(struct sigpending *queue) -{ - struct sigqueue *q; - - sigemptyset(&queue->signal); - while (!list_empty(&queue->list)) { - q = list_entry(queue->list.next, struct sigqueue , list); - list_del_init(&q->list); - __sigqueue_free(q); - } -} - -/* - * Flush all pending signals for a task. 
- */ -void __flush_signals(struct task_struct *t) -{ - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); -} - -void flush_signals(struct task_struct *t) -{ - unsigned long flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - __flush_signals(t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); -} - -static void __flush_itimer_signals(struct sigpending *pending) -{ - sigset_t signal, retain; - struct sigqueue *q, *n; - - signal = pending->signal; - sigemptyset(&retain); - - list_for_each_entry_safe(q, n, &pending->list, list) { - int sig = q->info.si_signo; - - if (likely(q->info.si_code != SI_TIMER)) { - sigaddset(&retain, sig); - } else { - sigdelset(&signal, sig); - list_del_init(&q->list); - __sigqueue_free(q); - } - } - - sigorsets(&pending->signal, &signal, &retain); -} - -void flush_itimer_signals(void) -{ - struct task_struct *tsk = current; - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - __flush_itimer_signals(&tsk->pending); - __flush_itimer_signals(&tsk->signal->shared_pending); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); -} - -void ignore_signals(struct task_struct *t) -{ - int i; - - for (i = 0; i < _NSIG; ++i) - t->sighand->action[i].sa.sa_handler = SIG_IGN; - - flush_signals(t); -} - -/* - * Flush all handlers for a task. 
- */ - -void -flush_signal_handlers(struct task_struct *t, int force_default) -{ - int i; - struct k_sigaction *ka = &t->sighand->action[0]; - for (i = _NSIG ; i != 0 ; i--) { - if (force_default || ka->sa.sa_handler != SIG_IGN) - ka->sa.sa_handler = SIG_DFL; - ka->sa.sa_flags = 0; - sigemptyset(&ka->sa.sa_mask); - ka++; - } -} - -int unhandled_signal(struct task_struct *tsk, int sig) -{ - void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; - if (is_global_init(tsk)) - return 1; - if (handler != SIG_IGN && handler != SIG_DFL) - return 0; - /* if ptraced, let the tracer determine */ - return !tsk->ptrace; -} - -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) -{ - struct sigqueue *q, *first = NULL; - - /* - * Collect the siginfo appropriate to this signal. Check if - * there is another siginfo for the same signal. 
- */ - list_for_each_entry(q, &list->list, list) { - if (q->info.si_signo == sig) { - if (first) - goto still_pending; - first = q; - } - } - - sigdelset(&list->signal, sig); - - if (first) { -still_pending: - list_del_init(&first->list); - copy_siginfo(info, &first->info); - __sigqueue_free(first); - } else { - /* - * Ok, it wasn't in the queue. This must be - * a fast-pathed signal or we must have been - * out of queue space. So zero out the info. - */ - info->si_signo = sig; - info->si_errno = 0; - info->si_code = SI_USER; - info->si_pid = 0; - info->si_uid = 0; - } -} - -static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) -{ - int sig = next_signal(pending, mask); - - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - - collect_signal(sig, pending, info); - } - - return sig; -} - -/* - * Dequeue a signal and return the element to the caller, which is - * expected to free it. - * - * All callers have to hold the siglock. - */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) -{ - int signr; - - /* We only dequeue private signals from ourselves, we don't let - * signalfd steal them - */ - signr = __dequeue_signal(&tsk->pending, mask, info); - if (!signr) { - signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); - /* - * itimer signal ? - * - * itimers are process shared and we restart periodic - * itimers in the signal delivery path to prevent DoS - * attacks in the high resolution timer case. This is - * compliant with the old way of self-restarting - * itimers, as the SIGALRM is a legacy signal and only - * queued once. Changing the restart behaviour to - * restart the timer in the signal dequeue path is - * reducing the timer noise on heavy loaded !highres - * systems too. 
- */ - if (unlikely(signr == SIGALRM)) { - struct hrtimer *tmr = &tsk->signal->real_timer; - - if (!hrtimer_is_queued(tmr) && - tsk->signal->it_real_incr.tv64 != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); - hrtimer_restart(tmr); - } - } - } - - recalc_sigpending(); - if (!signr) - return 0; - - if (unlikely(sig_kernel_stop(signr))) { - /* - * Set a marker that we have dequeued a stop signal. Our - * caller might release the siglock and then the pending - * stop signal it is about to process is no longer in the - * pending bitmasks, but must still be cleared by a SIGCONT - * (and overruled by a SIGKILL). So those cases clear this - * shared flag after we've set it. Note that this flag may - * remain set after the signal we return is ignored or - * handled. That doesn't matter because its only purpose - * is to alert stop-signal processing code when another - * processor has come along and cleared the flag. - */ - current->jobctl |= JOBCTL_STOP_DEQUEUED; - } - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { - /* - * Release the siglock to ensure proper locking order - * of timer locks outside of siglocks. Note, we leave - * irqs disabled here, since the posix-timers code is - * about to disable them again anyway. - */ - spin_unlock(&tsk->sighand->siglock); - do_schedule_next_timer(info); - spin_lock(&tsk->sighand->siglock); - } - return signr; -} - -/* - * Tell a process that it has a new active signal.. - * - * NOTE! we rely on the previous spin_lock to - * lock interrupts for us! We can only be called with - * "siglock" held, and the local interrupt must - * have been disabled when that got acquired! - * - * No need to set need_resched since signal event passing - * goes through ->blocked - */ -void signal_wake_up(struct task_struct *t, int resume) -{ - unsigned int mask; - - set_tsk_thread_flag(t, TIF_SIGPENDING); - - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable - * case. 
We don't check t->state here because there is a race with it - * executing another processor and just now entering stopped state. - * By using wake_up_state, we ensure the process will wake up and - * handle its death signal. - */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) - kick_process(t); -} - -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word. - */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - sigset_t m; - - sigandsets(&m, mask, &s->signal); - if (sigisemptyset(&m)) - return 0; - - sigandnsets(&s->signal, &s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (sigismember(mask, q->info.si_signo)) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. 
- */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - - if (!sigtestsetmask(&s->signal, mask)) - return 0; - - sigdelsetmask(&s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (q->info.si_signo < SIGRTMIN && - (mask & sigmask(q->info.si_signo))) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} - -static inline int is_si_special(const struct siginfo *info) -{ - return info <= SEND_SIG_FORCED; -} - -static inline bool si_fromuser(const struct siginfo *info) -{ - return info == SEND_SIG_NOINFO || - (!is_si_special(info) && SI_FROMUSER(info)); -} - -/* - * called with RCU read lock from check_kill_permission() - */ -static int kill_ok_by_cred(struct task_struct *t) -{ - const struct cred *cred = current_cred(); - const struct cred *tcred = __task_cred(t); - - if (cred->user->user_ns == tcred->user->user_ns && - (cred->euid == tcred->suid || - cred->euid == tcred->uid || - cred->uid == tcred->suid || - cred->uid == tcred->uid)) - return 1; - - if (ns_capable(tcred->user->user_ns, CAP_KILL)) - return 1; - - return 0; -} - -/* - * Bad permissions for sending the signal - * - the caller must hold the RCU read lock - */ -static int check_kill_permission(int sig, struct siginfo *info, - struct task_struct *t) -{ - struct pid *sid; - int error; - - if (!valid_signal(sig)) - return -EINVAL; - - if (!si_fromuser(info)) - return 0; - - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - - if (!same_thread_group(current, t) && - !kill_ok_by_cred(t)) { - switch (sig) { - case SIGCONT: - sid = task_session(t); - /* - * We don't return the error if sid == NULL. The - * task was unhashed, the caller must notice this. 
- */ - if (!sid || sid == task_session(current)) - break; - default: - return -EPERM; - } - } - - return security_task_kill(t, info, sig, 0); -} - -/** - * ptrace_trap_notify - schedule trap to notify ptracer - * @t: tracee wanting to notify tracer - * - * This function schedules sticky ptrace trap which is cleared on the next - * TRAP_STOP to notify ptracer of an event. @t must have been seized by - * ptracer. - * - * If @t is running, STOP trap will be taken. If trapped for STOP and - * ptracer is listening for events, tracee is woken up so that it can - * re-trap for the new event. If trapped otherwise, STOP trap will be - * eventually taken without returning to userland after the existing traps - * are finished by PTRACE_CONT. - * - * CONTEXT: - * Must be called with @task->sighand->siglock held. - */ -static void ptrace_trap_notify(struct task_struct *t) -{ - WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); - assert_spin_locked(&t->sighand->siglock); - - task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); - signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); -} - -/* - * Handle magic process-wide effects of stop/continue signals. Unlike - * the signal actions, these happen immediately at signal-generation - * time regardless of blocking, ignoring, or handling. This does the - * actual continuing for SIGCONT, but not the actual stopping for stop - * signals. The process stop is done as a signal action for SIG_DFL. - * - * Returns true if the signal should be actually delivered, otherwise - * it should be dropped. - */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) -{ - struct signal_struct *signal = p->signal; - struct task_struct *t; - - if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { - /* - * The process is in the middle of dying, nothing to do. - */ - } else if (sig_kernel_stop(sig)) { - /* - * This is a stop signal. Remove SIGCONT from all queues. 
- */ - rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); - t = p; - do { - rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); - } else if (sig == SIGCONT) { - unsigned int why; - /* - * Remove all stop signals from all queues, wake all threads. - */ - rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); - t = p; - do { - task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - if (likely(!(t->ptrace & PT_SEIZED))) - wake_up_state(t, __TASK_STOPPED); - else - ptrace_trap_notify(t); - } while_each_thread(p, t); - - /* - * Notify the parent with CLD_CONTINUED if we were stopped. - * - * If we were in the middle of a group stop, we pretend it - * was already finished, and then continued. Since SIGCHLD - * doesn't queue we report only CLD_STOPPED, as if the next - * CLD_CONTINUED was dropped. - */ - why = 0; - if (signal->flags & SIGNAL_STOP_STOPPED) - why |= SIGNAL_CLD_CONTINUED; - else if (signal->group_stop_count) - why |= SIGNAL_CLD_STOPPED; - - if (why) { - /* - * The first thread which returns from do_signal_stop() - * will take ->siglock, notice SIGNAL_CLD_MASK, and - * notify its parent. See get_signal_to_deliver(). - */ - signal->flags = why | SIGNAL_STOP_CONTINUED; - signal->group_stop_count = 0; - signal->group_exit_code = 0; - } - } - - return !sig_ignored(p, sig, from_ancestor_ns); -} - -/* - * Test if P wants to take SIG. After we've checked all threads with this, - * it's equivalent to finding no threads not blocking SIG. Any threads not - * blocking SIG were ruled out because they are not running and already - * have pending signals. Such threads will dequeue from the shared queue - * as soon as they're available, so putting the signal on the shared queue - * will be equivalent to sending it to one such thread. 
- */ -static inline int wants_signal(int sig, struct task_struct *p) -{ - if (sigismember(&p->blocked, sig)) - return 0; - if (p->flags & PF_EXITING) - return 0; - if (sig == SIGKILL) - return 1; - if (task_is_stopped_or_traced(p)) - return 0; - return task_curr(p) || !signal_pending(p); -} - -static void complete_signal(int sig, struct task_struct *p, int group) -{ - struct signal_struct *signal = p->signal; - struct task_struct *t; - - /* - * Now find a thread we can wake up to take the signal off the queue. - * - * If the main thread wants the signal, it gets first crack. - * Probably the least surprising to the average bear. - */ - if (wants_signal(sig, p)) - t = p; - else if (!group || thread_group_empty(p)) - /* - * There is just one thread and it does not need to be woken. - * It will dequeue unblocked signals before it runs again. - */ - return; - else { - /* - * Otherwise try to find a suitable thread. - */ - t = signal->curr_target; - while (!wants_signal(sig, t)) { - t = next_thread(t); - if (t == signal->curr_target) - /* - * No thread needs to be woken. - * Any eligible threads will see - * the signal in the queue soon. - */ - return; - } - signal->curr_target = t; - } - - /* - * Found a killable thread. If the signal will be fatal, - * then start taking the whole group down immediately. - */ - if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && - !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { - /* - * This signal will be fatal to the whole group. - */ - if (!sig_kernel_coredump(sig)) { - /* - * Start a group exit and wake everybody up. - * This way we don't have other threads - * running and doing things after a slower - * thread has the fatal signal pending. 
- */ - signal->flags = SIGNAL_GROUP_EXIT; - signal->group_exit_code = sig; - signal->group_stop_count = 0; - t = p; - do { - task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); - } while_each_thread(p, t); - return; - } - } - - /* - * The signal is already in the shared-pending queue. - * Tell the chosen thread to wake up and dequeue it. - */ - signal_wake_up(t, sig == SIGKILL); - return; -} - -static inline int legacy_queue(struct sigpending *signals, int sig) -{ - return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); -} - -/* - * map the uid in struct cred into user namespace *ns - */ -static inline uid_t map_cred_ns(const struct cred *cred, - struct user_namespace *ns) -{ - return user_ns_map_uid(ns, cred, cred->uid); -} - -#ifdef CONFIG_USER_NS -static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) -{ - if (current_user_ns() == task_cred_xxx(t, user_ns)) - return; - - if (SI_FROMKERNEL(info)) - return; - - info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), - current_cred(), info->si_uid); -} -#else -static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) -{ - return; -} -#endif - -static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group, int from_ancestor_ns) -{ - struct sigpending *pending; - struct sigqueue *q; - int override_rlimit; - - trace_signal_generate(sig, info, t); - - assert_spin_locked(&t->sighand->siglock); - - if (!prepare_signal(sig, t, from_ancestor_ns)) - return 0; - - pending = group ? &t->signal->shared_pending : &t->pending; - /* - * Short-circuit ignored signals and support queuing - * exactly one non-rt signal, so that we can get more - * detailed information about the cause of the signal. - */ - if (legacy_queue(pending, sig)) - return 0; - /* - * fast-pathed signals for kernel-internal things like SIGSTOP - * or SIGKILL. 
- */ - if (info == SEND_SIG_FORCED) - goto out_set; - - /* - * Real-time signals must be queued if sent by sigqueue, or - * some other real-time mechanism. It is implementation - * defined whether kill() does so. We attempt to do so, on - * the principle of least surprise, but since kill is not - * allowed to fail with EAGAIN when low on memory we just - * make sure at least one signal gets delivered and don't - * pass on the info struct. - */ - if (sig < SIGRTMIN) - override_rlimit = (is_si_special(info) || info->si_code >= 0); - else - override_rlimit = 0; - - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); - if (q) { - list_add_tail(&q->list, &pending->list); - switch ((unsigned long) info) { - case (unsigned long) SEND_SIG_NOINFO: - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_USER; - q->info.si_pid = task_tgid_nr_ns(current, - task_active_pid_ns(t)); - q->info.si_uid = current_uid(); - break; - case (unsigned long) SEND_SIG_PRIV: - q->info.si_signo = sig; - q->info.si_errno = 0; - q->info.si_code = SI_KERNEL; - q->info.si_pid = 0; - q->info.si_uid = 0; - break; - default: - copy_siginfo(&q->info, info); - if (from_ancestor_ns) - q->info.si_pid = 0; - break; - } - - userns_fixup_signal_uid(&q->info, t); - - } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) { - /* - * Queue overflow, abort. We may abort if the - * signal was rt and sent by user using something - * other than kill(). - */ - trace_signal_overflow_fail(sig, group, info); - return -EAGAIN; - } else { - /* - * This is a silent loss of information. We still - * send the signal, but the *info bits are lost. 
- */ - trace_signal_lose_info(sig, group, info); - } - } - -out_set: - signalfd_notify(t, sig); - sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); - return 0; -} - -static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) -{ - int from_ancestor_ns = 0; - -#ifdef CONFIG_PID_NS - from_ancestor_ns = si_fromuser(info) && - !task_pid_nr_ns(current, task_active_pid_ns(t)); -#endif - - return __send_signal(sig, info, t, group, from_ancestor_ns); -} - -static void print_fatal_signal(struct pt_regs *regs, int signr) -{ - printk("%s/%d: potentially unexpected fatal signal %d.\n", - current->comm, task_pid_nr(current), signr); - -#if defined(__i386__) && !defined(__arch_um__) - printk("code at %08lx: ", regs->ip); - { - int i; - for (i = 0; i < 16; i++) { - unsigned char insn; - - if (get_user(insn, (unsigned char *)(regs->ip + i))) - break; - printk("%02x ", insn); - } - } -#endif - printk("\n"); - preempt_disable(); - show_regs(regs); - preempt_enable(); -} - -static int __init setup_print_fatal_signals(char *str) -{ - get_option (&str, &print_fatal_signals); - - return 1; -} - -__setup("print-fatal-signals=", setup_print_fatal_signals); - -int -__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - return send_signal(sig, info, p, 1); -} - -static int -specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) -{ - return send_signal(sig, info, t, 0); -} - -int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - bool group) -{ - unsigned long flags; - int ret = -ESRCH; - - if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, group); - unlock_task_sighand(p, &flags); - } - - return ret; -} - -/* - * Force a signal that the process can't ignore: if necessary - * we unblock the signal and change any SIG_IGN to SIG_DFL. 
- * - * Note: If we unblock the signal, we always reset it to SIG_DFL, - * since we do not want to have a signal handler that was blocked - * be invoked when user space had explicitly blocked it. - * - * We don't want to have recursive SIGSEGV's etc, for example, - * that is why we also clear SIGNAL_UNKILLABLE. - */ -int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) -{ - unsigned long int flags; - int ret, blocked, ignored; - struct k_sigaction *action; - - spin_lock_irqsave(&t->sighand->siglock, flags); - action = &t->sighand->action[sig-1]; - ignored = action->sa.sa_handler == SIG_IGN; - blocked = sigismember(&t->blocked, sig); - if (blocked || ignored) { - action->sa.sa_handler = SIG_DFL; - if (blocked) { - sigdelset(&t->blocked, sig); - recalc_sigpending_and_wake(t); - } - } - if (action->sa.sa_handler == SIG_DFL) - t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = specific_send_sig_info(sig, info, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); - - return ret; -} - -/* - * Nuke all other threads in the group. 
- */ -int zap_other_threads(struct task_struct *p) -{ - struct task_struct *t = p; - int count = 0; - - p->signal->group_stop_count = 0; - - while_each_thread(p, t) { - task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - count++; - - /* Don't bother with already dead threads */ - if (t->exit_state) - continue; - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); - } - - return count; -} - -struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - struct sighand_struct *sighand; - - for (;;) { - local_irq_save(*flags); - rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) { - rcu_read_unlock(); - local_irq_restore(*flags); - break; - } - - spin_lock(&sighand->siglock); - if (likely(sighand == tsk->sighand)) { - rcu_read_unlock(); - break; - } - spin_unlock(&sighand->siglock); - rcu_read_unlock(); - local_irq_restore(*flags); - } - - return sighand; -} - -/* - * send signal info to all the members of a group - */ -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - int ret; - - rcu_read_lock(); - ret = check_kill_permission(sig, info, p); - rcu_read_unlock(); - - if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); - - return ret; -} - -/* - * __kill_pgrp_info() sends a signal to a process group: this is what the tty - * control characters do (^C, ^Z etc) - * - the caller must hold at least a readlock on tasklist_lock - */ -int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) -{ - struct task_struct *p = NULL; - int retval, success; - - success = 0; - retval = -ESRCH; - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); - success |= !err; - retval = err; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return success ? 
0 : retval; -} - -int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) -{ - int error = -ESRCH; - struct task_struct *p; - - rcu_read_lock(); -retry: - p = pid_task(pid, PIDTYPE_PID); - if (p) { - error = group_send_sig_info(sig, info, p); - if (unlikely(error == -ESRCH)) - /* - * The task was unhashed in between, try again. - * If it is dead, pid_task() will return NULL, - * if we race with de_thread() it will find the - * new leader. - */ - goto retry; - } - rcu_read_unlock(); - - return error; -} - -int kill_proc_info(int sig, struct siginfo *info, pid_t pid) -{ - int error; - rcu_read_lock(); - error = kill_pid_info(sig, info, find_vpid(pid)); - rcu_read_unlock(); - return error; -} - -static int kill_as_cred_perm(const struct cred *cred, - struct task_struct *target) -{ - const struct cred *pcred = __task_cred(target); - if (cred->user_ns != pcred->user_ns) - return 0; - if (cred->euid != pcred->suid && cred->euid != pcred->uid && - cred->uid != pcred->suid && cred->uid != pcred->uid) - return 0; - return 1; -} - -/* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, - const struct cred *cred, u32 secid) -{ - int ret = -EINVAL; - struct task_struct *p; - unsigned long flags; - - if (!valid_signal(sig)) - return ret; - - rcu_read_lock(); - p = pid_task(pid, PIDTYPE_PID); - if (!p) { - ret = -ESRCH; - goto out_unlock; - } - if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { - ret = -EPERM; - goto out_unlock; - } - ret = security_task_kill(p, info, sig, secid); - if (ret) - goto out_unlock; - - if (sig) { - if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, 1, 0); - unlock_task_sighand(p, &flags); - } else - ret = -ESRCH; - } -out_unlock: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); - -/* - * kill_something_info() interprets pid in interesting ways just like kill(2). 
- * - * POSIX specifies that kill(-1,sig) is unspecified, but what we have - * is probably wrong. Should make it like BSD or SYSV. - */ - -static int kill_something_info(int sig, struct siginfo *info, pid_t pid) -{ - int ret; - - if (pid > 0) { - rcu_read_lock(); - ret = kill_pid_info(sig, info, find_vpid(pid)); - rcu_read_unlock(); - return ret; - } - - read_lock(&tasklist_lock); - if (pid != -1) { - ret = __kill_pgrp_info(sig, info, - pid ? find_vpid(-pid) : task_pgrp(current)); - } else { - int retval = 0, count = 0; - struct task_struct * p; - - for_each_process(p) { - if (task_pid_vnr(p) > 1 && - !same_thread_group(p, current)) { - int err = group_send_sig_info(sig, info, p); - ++count; - if (err != -EPERM) - retval = err; - } - } - ret = count ? retval : -ESRCH; - } - read_unlock(&tasklist_lock); - - return ret; -} - -/* - * These are for backward compatibility with the rest of the kernel source. - */ - -int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - /* - * Make sure legacy kernel users don't send in bad values - * (normal paths check this in check_kill_permission). - */ - if (!valid_signal(sig)) - return -EINVAL; - - return do_send_sig_info(sig, info, p, false); -} - -#define __si_special(priv) \ - ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) - -int -send_sig(int sig, struct task_struct *p, int priv) -{ - return send_sig_info(sig, __si_special(priv), p); -} - -void -force_sig(int sig, struct task_struct *p) -{ - force_sig_info(sig, SEND_SIG_PRIV, p); -} - -/* - * When things go south during signal handling, we - * will force a SIGSEGV. And if the signal that caused - * the problem was already a SIGSEGV, we'll want to - * make sure we don't even try to deliver the signal.. 
- */ -int -force_sigsegv(int sig, struct task_struct *p) -{ - if (sig == SIGSEGV) { - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } - force_sig(SIGSEGV, p); - return 0; -} - -int kill_pgrp(struct pid *pid, int sig, int priv) -{ - int ret; - - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); - read_unlock(&tasklist_lock); - - return ret; -} -EXPORT_SYMBOL(kill_pgrp); - -int kill_pid(struct pid *pid, int sig, int priv) -{ - return kill_pid_info(sig, __si_special(priv), pid); -} -EXPORT_SYMBOL(kill_pid); - -/* - * These functions support sending signals using preallocated sigqueue - * structures. This is needed "because realtime applications cannot - * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of POSIX Timers - * we allocate the sigqueue structure from the timer_create. If this - * allocation fails we are able to report the failure to the application - * with an EAGAIN error. - */ -struct sigqueue *sigqueue_alloc(void) -{ - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); - - if (q) - q->flags |= SIGQUEUE_PREALLOC; - - return q; -} - -void sigqueue_free(struct sigqueue *q) -{ - unsigned long flags; - spinlock_t *lock = ¤t->sighand->siglock; - - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - /* - * We must hold ->siglock while testing q->list - * to serialize with collect_signal() or with - * __exit_signal()->flush_sigqueue(). - */ - spin_lock_irqsave(lock, flags); - q->flags &= ~SIGQUEUE_PREALLOC; - /* - * If it is queued it will be freed when dequeued, - * like the "regular" sigqueue. 
- */ - if (!list_empty(&q->list)) - q = NULL; - spin_unlock_irqrestore(lock, flags); - - if (q) - __sigqueue_free(q); -} - -int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) -{ - int sig = q->info.si_signo; - struct sigpending *pending; - unsigned long flags; - int ret; - - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - - ret = -1; - if (!likely(lock_task_sighand(t, &flags))) - goto ret; - - ret = 1; /* the signal is ignored */ - if (!prepare_signal(sig, t, 0)) - goto out; - - ret = 0; - if (unlikely(!list_empty(&q->list))) { - /* - * If an SI_TIMER entry is already queue just increment - * the overrun count. - */ - BUG_ON(q->info.si_code != SI_TIMER); - q->info.si_overrun++; - goto out; - } - q->info.si_overrun = 0; - - signalfd_notify(t, sig); - pending = group ? &t->signal->shared_pending : &t->pending; - list_add_tail(&q->list, &pending->list); - sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); -out: - unlock_task_sighand(t, &flags); -ret: - return ret; -} - -/* - * Let a parent know about the death of a child. - * For a stopped/continued status change, use do_notify_parent_cldstop instead. - * - * Returns true if our parent ignored us and so we've switched to - * self-reaping. - */ -bool do_notify_parent(struct task_struct *tsk, int sig) -{ - struct siginfo info; - unsigned long flags; - struct sighand_struct *psig; - bool autoreap = false; - - BUG_ON(sig == -1); - - /* do_notify_parent_cldstop should have been called instead. */ - BUG_ON(task_is_stopped_or_traced(tsk)); - - BUG_ON(!tsk->ptrace && - (tsk->group_leader != tsk || !thread_group_empty(tsk))); - - info.si_signo = sig; - info.si_errno = 0; - /* - * we are under tasklist_lock here so our parent is tied to - * us and cannot exit and release its namespace. 
- * - * the only it can is to switch its nsproxy with sys_unshare, - * bu uncharing pid namespaces is not allowed, so we'll always - * see relevant namespace - * - * write_lock() currently calls preempt_disable() which is the - * same as rcu_read_lock(), but according to Oleg, this is not - * correct to rely on this - */ - rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = map_cred_ns(__task_cred(tsk), - task_cred_xxx(tsk->parent, user_ns)); - rcu_read_unlock(); - - info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); - - info.si_status = tsk->exit_code & 0x7f; - if (tsk->exit_code & 0x80) - info.si_code = CLD_DUMPED; - else if (tsk->exit_code & 0x7f) - info.si_code = CLD_KILLED; - else { - info.si_code = CLD_EXITED; - info.si_status = tsk->exit_code >> 8; - } - - psig = tsk->parent->sighand; - spin_lock_irqsave(&psig->siglock, flags); - if (!tsk->ptrace && sig == SIGCHLD && - (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || - (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { - /* - * We are exiting and our parent doesn't care. POSIX.1 - * defines special semantics for setting SIGCHLD to SIG_IGN - * or setting the SA_NOCLDWAIT flag: we should be reaped - * automatically and not left for our parent's wait4 call. - * Rather than having the parent do it as a magic kind of - * signal handler, we just set this to tell do_exit that we - * can be cleaned up without becoming a zombie. Note that - * we still call __wake_up_parent in this case, because a - * blocked sys_wait4 might now return -ECHILD. - * - * Whether we send SIGCHLD or not for SA_NOCLDWAIT - * is implementation-defined: we do (if you don't want - * it, just use SIG_IGN instead). 
- */ - autoreap = true; - if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; - } - if (valid_signal(sig) && sig) - __group_send_sig_info(sig, &info, tsk->parent); - __wake_up_parent(tsk, tsk->parent); - spin_unlock_irqrestore(&psig->siglock, flags); - - return autoreap; -} - -/** - * do_notify_parent_cldstop - notify parent of stopped/continued state change - * @tsk: task reporting the state change - * @for_ptracer: the notification is for ptracer - * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report - * - * Notify @tsk's parent that the stopped/continued state has changed. If - * @for_ptracer is %false, @tsk's group leader notifies to its real parent. - * If %true, @tsk reports to @tsk->parent which should be the ptracer. - * - * CONTEXT: - * Must be called with tasklist_lock at least read locked. - */ -static void do_notify_parent_cldstop(struct task_struct *tsk, - bool for_ptracer, int why) -{ - struct siginfo info; - unsigned long flags; - struct task_struct *parent; - struct sighand_struct *sighand; - - if (for_ptracer) { - parent = tsk->parent; - } else { - tsk = tsk->group_leader; - parent = tsk->real_parent; - } - - info.si_signo = SIGCHLD; - info.si_errno = 0; - /* - * see comment in do_notify_parent() about the following 4 lines - */ - rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = map_cred_ns(__task_cred(tsk), - task_cred_xxx(parent, user_ns)); - rcu_read_unlock(); - - info.si_utime = cputime_to_clock_t(tsk->utime); - info.si_stime = cputime_to_clock_t(tsk->stime); - - info.si_code = why; - switch (why) { - case CLD_CONTINUED: - info.si_status = SIGCONT; - break; - case CLD_STOPPED: - info.si_status = tsk->signal->group_exit_code & 0x7f; - break; - case CLD_TRAPPED: - info.si_status = tsk->exit_code & 0x7f; - break; - default: - BUG(); - } - - sighand = parent->sighand; - spin_lock_irqsave(&sighand->siglock, flags); - if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && - 
!(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - __group_send_sig_info(SIGCHLD, &info, parent); - /* - * Even if SIGCHLD is not generated, we must wake up wait4 calls. - */ - __wake_up_parent(tsk, parent); - spin_unlock_irqrestore(&sighand->siglock, flags); -} - -static inline int may_ptrace_stop(void) -{ - if (!likely(current->ptrace)) - return 0; - /* - * Are we in the middle of do_coredump? - * If so and our tracer is also part of the coredump stopping - * is a deadlock situation, and pointless because our tracer - * is dead so don't allow us to stop. - * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_state != NULL. Otherwise it - * is safe to enter schedule(). - */ - if (unlikely(current->mm->core_state) && - unlikely(current->mm == current->parent->mm)) - return 0; - - return 1; -} - -/* - * Return non-zero if there is a SIGKILL that should be waking us up. - * Called with the siglock held. - */ -static int sigkill_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); -} - -/* - * This must be called with current->sighand->siglock held. - * - * This should be the path for all ptrace stops. - * We always set current->last_siginfo while stopped here. - * That makes it a way to test a stopped process for - * being ptrace-stopped vs being job-control-stopped. - * - * If we actually decide not to stop at all because the tracer - * is gone, we keep current->exit_code unless clear_code. - */ -static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) - __releases(¤t->sighand->siglock) - __acquires(¤t->sighand->siglock) -{ - bool gstop_done = false; - - if (arch_ptrace_stop_needed(exit_code, info)) { - /* - * The arch code has something special to do before a - * ptrace stop. This is allowed to block, e.g. for faults - * on user stack pages. 
We can't keep the siglock while - * calling arch_ptrace_stop, so we must release it now. - * To preserve proper semantics, we must do this before - * any signal bookkeeping like checking group_stop_count. - * Meanwhile, a SIGKILL could come in before we retake the - * siglock. That must prevent us from sleeping in TASK_TRACED. - * So after regaining the lock, we must check for SIGKILL. - */ - spin_unlock_irq(¤t->sighand->siglock); - arch_ptrace_stop(exit_code, info); - spin_lock_irq(¤t->sighand->siglock); - if (sigkill_pending(current)) - return; - } - - /* - * We're committing to trapping. TRACED should be visible before - * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). - * Also, transition to TRACED and updates to ->jobctl should be - * atomic with respect to siglock and should be done after the arch - * hook as siglock is released and regrabbed across it. - */ - set_current_state(TASK_TRACED); - - current->last_siginfo = info; - current->exit_code = exit_code; - - /* - * If @why is CLD_STOPPED, we're trapping to participate in a group - * stop. Do the bookkeeping. Note that if SIGCONT was delievered - * across siglock relocks since INTERRUPT was scheduled, PENDING - * could be clear now. We act as if SIGCONT is received after - * TASK_TRACED is entered - ignore it. - */ - if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) - gstop_done = task_participate_group_stop(current); - - /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ - task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); - if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) - task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); - - /* entering a trap, clear TRAPPING */ - task_clear_jobctl_trapping(current); - - spin_unlock_irq(¤t->sighand->siglock); - read_lock(&tasklist_lock); - if (may_ptrace_stop()) { - /* - * Notify parents of the stop. - * - * While ptraced, there are two parents - the ptracer and - * the real_parent of the group_leader. 
The ptracer should - * know about every stop while the real parent is only - * interested in the completion of group stop. The states - * for the two don't interact with each other. Notify - * separately unless they're gonna be duplicates. - */ - do_notify_parent_cldstop(current, true, why); - if (gstop_done && ptrace_reparented(current)) - do_notify_parent_cldstop(current, false, why); - - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); - read_unlock(&tasklist_lock); - preempt_enable_no_resched(); - schedule(); - } else { - /* - * By the time we got the lock, our tracer went away. - * Don't drop the lock yet, another tracer may come. - * - * If @gstop_done, the ptracer went away between group stop - * completion and here. During detach, it would have set - * JOBCTL_STOP_PENDING on us and we'll re-enter - * TASK_STOPPED in do_signal_stop() on return, so notifying - * the real parent of the group stop completion is enough. - */ - if (gstop_done) - do_notify_parent_cldstop(current, false, why); - - __set_current_state(TASK_RUNNING); - if (clear_code) - current->exit_code = 0; - read_unlock(&tasklist_lock); - } - - /* - * While in TASK_TRACED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - - /* - * We are back. Now reacquire the siglock before touching - * last_siginfo, so that we are sure to have synchronized with - * any signal-sending on another CPU that wants to examine it. - */ - spin_lock_irq(¤t->sighand->siglock); - current->last_siginfo = NULL; - - /* LISTENING can be set only during STOP traps, clear it */ - current->jobctl &= ~JOBCTL_LISTENING; - - /* - * Queued signals ignored us while we were stopped for tracing. - * So check for any that we should take before resuming user mode. 
- * This sets TIF_SIGPENDING, but never clears it. - */ - recalc_sigpending_tsk(current); -} - -static void ptrace_do_notify(int signr, int exit_code, int why) -{ - siginfo_t info; - - memset(&info, 0, sizeof info); - info.si_signo = signr; - info.si_code = exit_code; - info.si_pid = task_pid_vnr(current); - info.si_uid = current_uid(); - - /* Let the debugger run. */ - ptrace_stop(exit_code, why, 1, &info); -} - -void ptrace_notify(int exit_code) -{ - BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - - spin_lock_irq(¤t->sighand->siglock); - ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); - spin_unlock_irq(¤t->sighand->siglock); -} - -/** - * do_signal_stop - handle group stop for SIGSTOP and other stop signals - * @signr: signr causing group stop if initiating - * - * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr - * and participate in it. If already set, participate in the existing - * group stop. If participated in a group stop (and thus slept), %true is - * returned with siglock released. - * - * If ptraced, this function doesn't handle stop itself. Instead, - * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock - * untouched. The caller must ensure that INTERRUPT trap handling takes - * places afterwards. - * - * CONTEXT: - * Must be called with @current->sighand->siglock held, which is released - * on %true return. - * - * RETURNS: - * %false if group stop is already cancelled or ptrace trap is scheduled. - * %true if participated in group stop. 
- */ -static bool do_signal_stop(int signr) - __releases(¤t->sighand->siglock) -{ - struct signal_struct *sig = current->signal; - - if (!(current->jobctl & JOBCTL_STOP_PENDING)) { - unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; - struct task_struct *t; - - /* signr will be recorded in task->jobctl for retries */ - WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); - - if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || - unlikely(signal_group_exit(sig))) - return false; - /* - * There is no group stop already in progress. We must - * initiate one now. - * - * While ptraced, a task may be resumed while group stop is - * still in effect and then receive a stop signal and - * initiate another group stop. This deviates from the - * usual behavior as two consecutive stop signals can't - * cause two group stops when !ptraced. That is why we - * also check !task_is_stopped(t) below. - * - * The condition can be distinguished by testing whether - * SIGNAL_STOP_STOPPED is already set. Don't generate - * group_exit_code in such case. - * - * This is not necessary for SIGNAL_STOP_CONTINUED because - * an intervening stop signal is required to cause two - * continued events regardless of ptrace. - */ - if (!(sig->flags & SIGNAL_STOP_STOPPED)) - sig->group_exit_code = signr; - - sig->group_stop_count = 0; - - if (task_set_jobctl_pending(current, signr | gstop)) - sig->group_stop_count++; - - for (t = next_thread(current); t != current; - t = next_thread(t)) { - /* - * Setting state to TASK_STOPPED for a group - * stop is always done with the siglock held, - * so this check has no races. 
- */ - if (!task_is_stopped(t) && - task_set_jobctl_pending(t, signr | gstop)) { - sig->group_stop_count++; - if (likely(!(t->ptrace & PT_SEIZED))) - signal_wake_up(t, 0); - else - ptrace_trap_notify(t); - } - } - } - - if (likely(!current->ptrace)) { - int notify = 0; - - /* - * If there are no other threads in the group, or if there - * is a group stop in progress and we are the last to stop, - * report to the parent. - */ - if (task_participate_group_stop(current)) - notify = CLD_STOPPED; - - __set_current_state(TASK_STOPPED); - spin_unlock_irq(¤t->sighand->siglock); - - /* - * Notify the parent of the group stop completion. Because - * we're not holding either the siglock or tasklist_lock - * here, ptracer may attach inbetween; however, this is for - * group stop and should always be delivered to the real - * parent of the group leader. The new ptracer will get - * its notification when this task transitions into - * TASK_TRACED. - */ - if (notify) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, false, notify); - read_unlock(&tasklist_lock); - } - - /* Now we don't run again until woken by SIGCONT or SIGKILL */ - schedule(); - return true; - } else { - /* - * While ptraced, group stop is handled by STOP trap. - * Schedule it and let the caller deal with it. - */ - task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); - return false; - } -} - -/** - * do_jobctl_trap - take care of ptrace jobctl traps - * - * When PT_SEIZED, it's used for both group stop and explicit - * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with - * accompanying siginfo. If stopped, lower eight bits of exit_code contain - * the stop signal; otherwise, %SIGTRAP. - * - * When !PT_SEIZED, it's used only for group stop trap with stop signal - * number as exit_code and no siginfo. - * - * CONTEXT: - * Must be called with @current->sighand->siglock held, which may be - * released and re-acquired before returning with intervening sleep. 
- */ -static void do_jobctl_trap(void) -{ - struct signal_struct *signal = current->signal; - int signr = current->jobctl & JOBCTL_STOP_SIGMASK; - - if (current->ptrace & PT_SEIZED) { - if (!signal->group_stop_count && - !(signal->flags & SIGNAL_STOP_STOPPED)) - signr = SIGTRAP; - WARN_ON_ONCE(!signr); - ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), - CLD_STOPPED); - } else { - WARN_ON_ONCE(!signr); - ptrace_stop(signr, CLD_STOPPED, 0, NULL); - current->exit_code = 0; - } -} - -static int ptrace_signal(int signr, siginfo_t *info, - struct pt_regs *regs, void *cookie) -{ - ptrace_signal_deliver(regs, cookie); - /* - * We do not check sig_kernel_stop(signr) but set this marker - * unconditionally because we do not know whether debugger will - * change signr. This flag has no meaning unless we are going - * to stop after return from ptrace_stop(). In this case it will - * be checked in do_signal_stop(), we should only stop if it was - * not cleared by SIGCONT while we were sleeping. See also the - * comment in dequeue_signal(). - */ - current->jobctl |= JOBCTL_STOP_DEQUEUED; - ptrace_stop(signr, CLD_TRAPPED, 0, info); - - /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; - if (signr == 0) - return signr; - - current->exit_code = 0; - - /* - * Update the siginfo structure if the signal has - * changed. If the debugger wanted something - * specific in the siginfo structure then it should - * have updated *info via PTRACE_SETSIGINFO. - */ - if (signr != info->si_signo) { - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; - rcu_read_lock(); - info->si_pid = task_pid_vnr(current->parent); - info->si_uid = map_cred_ns(__task_cred(current->parent), - current_user_ns()); - rcu_read_unlock(); - } - - /* If the (new) signal is now blocked, requeue it. 
*/ - if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); - signr = 0; - } - - return signr; -} - -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) -{ - struct sighand_struct *sighand = current->sighand; - struct signal_struct *signal = current->signal; - int signr; - -relock: - /* - * We'll jump back here after any time we were stopped in TASK_STOPPED. - * While in TASK_STOPPED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - - spin_lock_irq(&sighand->siglock); - /* - * Every stopped thread goes here after wakeup. Check to see if - * we should notify the parent, prepare_signal(SIGCONT) encodes - * the CLD_ si_code into SIGNAL_CLD_MASK bits. - */ - if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - int why; - - if (signal->flags & SIGNAL_CLD_CONTINUED) - why = CLD_CONTINUED; - else - why = CLD_STOPPED; - - signal->flags &= ~SIGNAL_CLD_MASK; - - spin_unlock_irq(&sighand->siglock); - - /* - * Notify the parent that we're continuing. This event is - * always per-process and doesn't make whole lot of sense - * for ptracers, who shouldn't consume the state via - * wait(2) either, but, for backward compatibility, notify - * the ptracer of the group leader too unless it's gonna be - * a duplicate. 
- */ - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, false, why); - - if (ptrace_reparented(current->group_leader)) - do_notify_parent_cldstop(current->group_leader, - true, why); - read_unlock(&tasklist_lock); - - goto relock; - } - - for (;;) { - struct k_sigaction *ka; - - if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && - do_signal_stop(0)) - goto relock; - - if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { - do_jobctl_trap(); - spin_unlock_irq(&sighand->siglock); - goto relock; - } - - signr = dequeue_signal(current, ¤t->blocked, info); - - if (!signr) - break; /* will return 0 */ - - if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); - if (!signr) - continue; - } - - ka = &sighand->action[signr-1]; - - /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); - - if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ - continue; - if (ka->sa.sa_handler != SIG_DFL) { - /* Run the handler. */ - *return_ka = *ka; - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; - - break; /* will return non-zero "signr" value */ - } - - /* - * Now we are doing the default action for this signal. - */ - if (sig_kernel_ignore(signr)) /* Default is nothing. */ - continue; - - /* - * Global init gets no signals it doesn't want. - * Container-init gets no signals it doesn't want from same - * container. - * - * Note that if global/container-init sees a sig_kernel_only() - * signal here, the signal must have been generated internally - * or must have come from an ancestor namespace. In either - * case, the signal cannot be dropped. - */ - if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && - !sig_kernel_only(signr)) - continue; - - if (sig_kernel_stop(signr)) { - /* - * The default action is to stop all threads in - * the thread group. The job control signals - * do nothing in an orphaned pgrp, but SIGSTOP - * always works. 
Note that siglock needs to be - * dropped during the call to is_orphaned_pgrp() - * because of lock ordering with tasklist_lock. - * This allows an intervening SIGCONT to be posted. - * We need to check for that and bail out if necessary. - */ - if (signr != SIGSTOP) { - spin_unlock_irq(&sighand->siglock); - - /* signals can be posted during this window */ - - if (is_current_pgrp_orphaned()) - goto relock; - - spin_lock_irq(&sighand->siglock); - } - - if (likely(do_signal_stop(info->si_signo))) { - /* It released the siglock. */ - goto relock; - } - - /* - * We didn't actually stop, due to a race - * with SIGCONT or something like that. - */ - continue; - } - - spin_unlock_irq(&sighand->siglock); - - /* - * Anything else is fatal, maybe with a core dump. - */ - current->flags |= PF_SIGNALED; - - if (sig_kernel_coredump(signr)) { - if (print_fatal_signals) - print_fatal_signal(regs, info->si_signo); - /* - * If it was able to dump core, this kills all - * other threads in the group and synchronizes with - * their demise. If we lost the race with another - * thread getting here, it set group_exit_code - * first and our do_group_exit call below will use - * that value and ignore the one we pass it. - */ - do_coredump(info->si_signo, info->si_signo, regs); - } - - /* - * Death signals, no core dump. - */ - do_group_exit(info->si_signo); - /* NOTREACHED */ - } - spin_unlock_irq(&sighand->siglock); - return signr; -} - -/** - * block_sigmask - add @ka's signal mask to current->blocked - * @ka: action for @signr - * @signr: signal that has been successfully delivered - * - * This function should be called when a signal has succesfully been - * delivered. It adds the mask of signals for @ka to current->blocked - * so that they are blocked during the execution of the signal - * handler. In addition, @signr will be blocked unless %SA_NODEFER is - * set in @ka->sa.sa_flags. 
- */ -void block_sigmask(struct k_sigaction *ka, int signr) -{ - sigset_t blocked; - - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, signr); - set_current_blocked(&blocked); -} - -/* - * It could be that complete_signal() picked us to notify about the - * group-wide signal. Other threads should be notified now to take - * the shared signals in @which since we will not. - */ -static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) -{ - sigset_t retarget; - struct task_struct *t; - - sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); - if (sigisemptyset(&retarget)) - return; - - t = tsk; - while_each_thread(tsk, t) { - if (t->flags & PF_EXITING) - continue; - - if (!has_pending_signals(&retarget, &t->blocked)) - continue; - /* Remove the signals this thread can handle. */ - sigandsets(&retarget, &retarget, &t->blocked); - - if (!signal_pending(t)) - signal_wake_up(t, 0); - - if (sigisemptyset(&retarget)) - break; - } -} - -void exit_signals(struct task_struct *tsk) -{ - int group_stop = 0; - sigset_t unblocked; - - /* - * @tsk is about to have PF_EXITING set - lock out users which - * expect stable threadgroup. - */ - threadgroup_change_begin(tsk); - - if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { - tsk->flags |= PF_EXITING; - threadgroup_change_end(tsk); - return; - } - - spin_lock_irq(&tsk->sighand->siglock); - /* - * From now this task is not visible for group-wide signals, - * see wants_signal(), do_signal_stop(). 
- */ - tsk->flags |= PF_EXITING; - - threadgroup_change_end(tsk); - - if (!signal_pending(tsk)) - goto out; - - unblocked = tsk->blocked; - signotset(&unblocked); - retarget_shared_pending(tsk, &unblocked); - - if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && - task_participate_group_stop(tsk)) - group_stop = CLD_STOPPED; -out: - spin_unlock_irq(&tsk->sighand->siglock); - - /* - * If group stop has completed, deliver the notification. This - * should always go to the real parent of the group leader. - */ - if (unlikely(group_stop)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, false, group_stop); - read_unlock(&tasklist_lock); - } -} - -EXPORT_SYMBOL(recalc_sigpending); -EXPORT_SYMBOL_GPL(dequeue_signal); -EXPORT_SYMBOL(flush_signals); -EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(send_sig); -EXPORT_SYMBOL(send_sig_info); -EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - - -/* - * System call entry points. - */ - -/** - * sys_restart_syscall - restart a system call - */ -SYSCALL_DEFINE0(restart_syscall) -{ - struct restart_block *restart = ¤t_thread_info()->restart_block; - return restart->fn(restart); -} - -long do_no_restart_syscall(struct restart_block *param) -{ - return -EINTR; -} - -static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) -{ - if (signal_pending(tsk) && !thread_group_empty(tsk)) { - sigset_t newblocked; - /* A set of now blocked but previously unblocked signals. */ - sigandnsets(&newblocked, newset, ¤t->blocked); - retarget_shared_pending(tsk, &newblocked); - } - tsk->blocked = *newset; - recalc_sigpending(); -} - -/** - * set_current_blocked - change current->blocked mask - * @newset: new mask - * - * It is wrong to change ->blocked directly, this helper should be used - * to ensure the process can't miss a shared signal we are going to block. 
- */ -void set_current_blocked(const sigset_t *newset) -{ - struct task_struct *tsk = current; - - spin_lock_irq(&tsk->sighand->siglock); - __set_task_blocked(tsk, newset); - spin_unlock_irq(&tsk->sighand->siglock); -} - -/* - * This is also useful for kernel threads that want to temporarily - * (or permanently) block certain signals. - * - * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel - * interface happily blocks "unblockable" signals like SIGKILL - * and friends. - */ -int sigprocmask(int how, sigset_t *set, sigset_t *oldset) -{ - struct task_struct *tsk = current; - sigset_t newset; - - /* Lockless, only current can change ->blocked, never from irq */ - if (oldset) - *oldset = tsk->blocked; - - switch (how) { - case SIG_BLOCK: - sigorsets(&newset, &tsk->blocked, set); - break; - case SIG_UNBLOCK: - sigandnsets(&newset, &tsk->blocked, set); - break; - case SIG_SETMASK: - newset = *set; - break; - default: - return -EINVAL; - } - - set_current_blocked(&newset); - return 0; -} - -/** - * sys_rt_sigprocmask - change the list of currently blocked signals - * @how: whether to add, remove, or set signals - * @nset: stores pending signals - * @oset: previous value of signal mask if non-null - * @sigsetsize: size of sigset_t type - */ -SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, - sigset_t __user *, oset, size_t, sigsetsize) -{ - sigset_t old_set, new_set; - int error; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - old_set = current->blocked; - - if (nset) { - if (copy_from_user(&new_set, nset, sizeof(sigset_t))) - return -EFAULT; - sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - error = sigprocmask(how, &new_set, NULL); - if (error) - return error; - } - - if (oset) { - if (copy_to_user(oset, &old_set, sizeof(sigset_t))) - return -EFAULT; - } - - return 0; -} - -long do_sigpending(void __user *set, unsigned long sigsetsize) -{ - long error = -EINVAL; - sigset_t pending; - - if (sigsetsize > sizeof(sigset_t)) - goto out; - - spin_lock_irq(¤t->sighand->siglock); - sigorsets(&pending, ¤t->pending.signal, - ¤t->signal->shared_pending.signal); - spin_unlock_irq(¤t->sighand->siglock); - - /* Outside the lock because only this thread touches it. */ - sigandsets(&pending, ¤t->blocked, &pending); - - error = -EFAULT; - if (!copy_to_user(set, &pending, sigsetsize)) - error = 0; - -out: - return error; -} - -/** - * sys_rt_sigpending - examine a pending signal that has been raised - * while blocked - * @set: stores pending signals - * @sigsetsize: size of sigset_t type or larger - */ -SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) -{ - return do_sigpending(set, sigsetsize); -} - -#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER - -int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) -{ - int err; - - if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) - return -EFAULT; - if (from->si_code < 0) - return __copy_to_user(to, from, sizeof(siginfo_t)) - ? -EFAULT : 0; - /* - * If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * Please remember to update the signalfd_copyinfo() function - * inside fs/signalfd.c too, in case siginfo_t changes. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. 
- */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code & __SI_MASK) { - case __SI_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case __SI_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case __SI_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case __SI_FAULT: - err |= __put_user(from->si_addr, &to->si_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(from->si_trapno, &to->si_trapno); -#endif -#ifdef BUS_MCEERR_AO - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ - if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif - break; - case __SI_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - default: /* this is just in case for now ... 
*/ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - } - return err; -} - -#endif - -/** - * do_sigtimedwait - wait for queued signals specified in @which - * @which: queued signals to wait for - * @info: if non-null, the signal's siginfo is returned here - * @ts: upper bound on process time suspension - */ -int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) -{ - struct task_struct *tsk = current; - long timeout = MAX_SCHEDULE_TIMEOUT; - sigset_t mask = *which; - int sig; - - if (ts) { - if (!timespec_valid(ts)) - return -EINVAL; - timeout = timespec_to_jiffies(ts); - /* - * We can be close to the next tick, add another one - * to ensure we will wait at least the time asked for. - */ - if (ts->tv_sec || ts->tv_nsec) - timeout++; - } - - /* - * Invert the set of allowed signals to get those we want to block. - */ - sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - signotset(&mask); - - spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info); - if (!sig && timeout) { - /* - * None ready, temporarily unblock those we're interested - * while we are sleeping in so that we'll be awakened when - * they arrive. Unblocking is always fine, we can avoid - * set_current_blocked(). - */ - tsk->real_blocked = tsk->blocked; - sigandsets(&tsk->blocked, &tsk->blocked, &mask); - recalc_sigpending(); - spin_unlock_irq(&tsk->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(&tsk->sighand->siglock); - __set_task_blocked(tsk, &tsk->real_blocked); - siginitset(&tsk->real_blocked, 0); - sig = dequeue_signal(tsk, &mask, info); - } - spin_unlock_irq(&tsk->sighand->siglock); - - if (sig) - return sig; - return timeout ? 
-EINTR : -EAGAIN; -} - -/** - * sys_rt_sigtimedwait - synchronously wait for queued signals specified - * in @uthese - * @uthese: queued signals to wait for - * @uinfo: if non-null, the signal's siginfo is returned here - * @uts: upper bound on process time suspension - * @sigsetsize: size of sigset_t type - */ -SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, - size_t, sigsetsize) -{ - sigset_t these; - struct timespec ts; - siginfo_t info; - int ret; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&these, uthese, sizeof(these))) - return -EFAULT; - - if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) - return -EFAULT; - } - - ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); - - if (ret > 0 && uinfo) { - if (copy_siginfo_to_user(uinfo, &info)) - ret = -EFAULT; - } - - return ret; -} - -/** - * sys_kill - send a signal to a process - * @pid: the PID of the process - * @sig: signal to be sent - */ -SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) -{ - struct siginfo info; - - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); - - return kill_something_info(sig, &info, pid); -} - -static int -do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) -{ - struct task_struct *p; - int error = -ESRCH; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { - error = check_kill_permission(sig, info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - */ - if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); - /* - * If lock_task_sighand() failed we pretend the task - * dies after receiving the signal. The window is tiny, - * and the signal is private anyway. 
- */ - if (unlikely(error == -ESRCH)) - error = 0; - } - } - rcu_read_unlock(); - - return error; -} - -static int do_tkill(pid_t tgid, pid_t pid, int sig) -{ - struct siginfo info; - - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); - - return do_send_specific(tgid, pid, sig, &info); -} - -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the @tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) -{ - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - - return do_tkill(tgid, pid, sig); -} - -/** - * sys_tkill - send signal to one specific task - * @pid: the PID of the task - * @sig: signal to be sent - * - * Send a signal to only one task, even if it's a CLONE_THREAD task. - */ -SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) -{ - /* This is only valid for single tasks */ - if (pid <= 0) - return -EINVAL; - - return do_tkill(0, pid, sig); -} - -/** - * sys_rt_sigqueueinfo - send signal information to a signal - * @pid: the PID of the thread - * @sig: signal to be sent - * @uinfo: signal info to be sent - */ -SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, - siginfo_t __user *, uinfo) -{ - siginfo_t info; - - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) - return -EFAULT; - - /* Not even root can pretend to send signals from the kernel. - * Nor can they impersonate a kill()/tgkill(), which adds source info. 
- */ - if (info.si_code >= 0 || info.si_code == SI_TKILL) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info.si_code < 0); - return -EPERM; - } - info.si_signo = sig; - - /* POSIX.1b doesn't mention process groups. */ - return kill_proc_info(sig, &info, pid); -} - -long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) -{ - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - - /* Not even root can pretend to send signals from the kernel. - * Nor can they impersonate a kill()/tgkill(), which adds source info. - */ - if (info->si_code >= 0 || info->si_code == SI_TKILL) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info->si_code < 0); - return -EPERM; - } - info->si_signo = sig; - - return do_send_specific(tgid, pid, sig, info); -} - -SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, - siginfo_t __user *, uinfo) -{ - siginfo_t info; - - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) - return -EFAULT; - - return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); -} - -int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) -{ - struct task_struct *t = current; - struct k_sigaction *k; - sigset_t mask; - - if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) - return -EINVAL; - - k = &t->sighand->action[sig-1]; - - spin_lock_irq(¤t->sighand->siglock); - if (oact) - *oact = *k; - - if (act) { - sigdelsetmask(&act->sa.sa_mask, - sigmask(SIGKILL) | sigmask(SIGSTOP)); - *k = *act; - /* - * POSIX 3.3.1.3: - * "Setting a signal action to SIG_IGN for a signal that is - * pending shall cause the pending signal to be discarded, - * whether or not it is blocked." 
- * - * "Setting a signal action to SIG_DFL for a signal that is - * pending and whose default action is to ignore the signal - * (for example, SIGCHLD), shall cause the pending signal to - * be discarded, whether or not it is blocked" - */ - if (sig_handler_ignored(sig_handler(t, sig), sig)) { - sigemptyset(&mask); - sigaddset(&mask, sig); - rm_from_queue_full(&mask, &t->signal->shared_pending); - do { - rm_from_queue_full(&mask, &t->pending); - t = next_thread(t); - } while (t != current); - } - } - - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -int -do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) -{ - stack_t oss; - int error; - - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); - - if (uss) { - void __user *ss_sp; - size_t ss_size; - int ss_flags; - - error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) - goto out; - error = __get_user(ss_sp, &uss->ss_sp) | - __get_user(ss_flags, &uss->ss_flags) | - __get_user(ss_size, &uss->ss_size); - if (error) - goto out; - - error = -EPERM; - if (on_sig_stack(sp)) - goto out; - - error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. 
- */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) - goto out; - - if (ss_flags == SS_DISABLE) { - ss_size = 0; - ss_sp = NULL; - } else { - error = -ENOMEM; - if (ss_size < MINSIGSTKSZ) - goto out; - } - - current->sas_ss_sp = (unsigned long) ss_sp; - current->sas_ss_size = ss_size; - } - - error = 0; - if (uoss) { - error = -EFAULT; - if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) - goto out; - error = __put_user(oss.ss_sp, &uoss->ss_sp) | - __put_user(oss.ss_size, &uoss->ss_size) | - __put_user(oss.ss_flags, &uoss->ss_flags); - } - -out: - return error; -} - -#ifdef __ARCH_WANT_SYS_SIGPENDING - -/** - * sys_sigpending - examine pending signals - * @set: where mask of pending signal is returned - */ -SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) -{ - return do_sigpending(set, sizeof(*set)); -} - -#endif - -#ifdef __ARCH_WANT_SYS_SIGPROCMASK -/** - * sys_sigprocmask - examine and change blocked signals - * @how: whether to add, remove, or set signals - * @nset: signals to add or remove (if non-null) - * @oset: previous value of signal mask if non-null - * - * Some platforms have their own version with special arguments; - * others support only sys_rt_sigprocmask. 
- */ - -SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, - old_sigset_t __user *, oset) -{ - old_sigset_t old_set, new_set; - sigset_t new_blocked; - - old_set = current->blocked.sig[0]; - - if (nset) { - if (copy_from_user(&new_set, nset, sizeof(*nset))) - return -EFAULT; - new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); - - new_blocked = current->blocked; - - switch (how) { - case SIG_BLOCK: - sigaddsetmask(&new_blocked, new_set); - break; - case SIG_UNBLOCK: - sigdelsetmask(&new_blocked, new_set); - break; - case SIG_SETMASK: - new_blocked.sig[0] = new_set; - break; - default: - return -EINVAL; - } - - set_current_blocked(&new_blocked); - } - - if (oset) { - if (copy_to_user(oset, &old_set, sizeof(*oset))) - return -EFAULT; - } - - return 0; -} -#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ - -#ifdef __ARCH_WANT_SYS_RT_SIGACTION -/** - * sys_rt_sigaction - alter an action taken by a process - * @sig: signal to be sent - * @act: new sigaction - * @oact: used to save the previous sigaction - * @sigsetsize: size of sigset_t type - */ -SYSCALL_DEFINE4(rt_sigaction, int, sig, - const struct sigaction __user *, act, - struct sigaction __user *, oact, - size_t, sigsetsize) -{ - struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - goto out; - - if (act) { - if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) - return -EFAULT; - } - - ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); - - if (!ret && oact) { - if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) - return -EFAULT; - } -out: - return ret; -} -#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ - -#ifdef __ARCH_WANT_SYS_SGETMASK - -/* - * For backwards compatibility. Functionality superseded by sigprocmask. 
- */ -SYSCALL_DEFINE0(sgetmask) -{ - /* SMP safe */ - return current->blocked.sig[0]; -} - -SYSCALL_DEFINE1(ssetmask, int, newmask) -{ - int old = current->blocked.sig[0]; - sigset_t newset; - - siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); - set_current_blocked(&newset); - - return old; -} -#endif /* __ARCH_WANT_SGETMASK */ - -#ifdef __ARCH_WANT_SYS_SIGNAL -/* - * For backwards compatibility. Functionality superseded by sigaction. - */ -SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) -{ - struct k_sigaction new_sa, old_sa; - int ret; - - new_sa.sa.sa_handler = handler; - new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; - sigemptyset(&new_sa.sa.sa_mask); - - ret = do_sigaction(sig, &new_sa, &old_sa); - - return ret ? ret : (unsigned long)old_sa.sa.sa_handler; -} -#endif /* __ARCH_WANT_SYS_SIGNAL */ - -#ifdef __ARCH_WANT_SYS_PAUSE - -SYSCALL_DEFINE0(pause) -{ - while (!signal_pending(current)) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - } - return -ERESTARTNOHAND; -} - -#endif - -#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND -/** - * sys_rt_sigsuspend - replace the signal mask for a value with the - * @unewset value until a signal is received - * @unewset: new signal mask value - * @sigsetsize: size of sigset_t type - */ -SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) -{ - sigset_t newset; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; -} -#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ - -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - -void __init signals_init(void) -{ - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); -} - -#ifdef CONFIG_KGDB_KDB -#include -/* - * kdb_send_sig_info - Allows kdb to send signals without exposing - * signal internals. This function checks if the required locks are - * available before calling the main signal code, to avoid kdb - * deadlocks. - */ -void -kdb_send_sig_info(struct task_struct *t, struct siginfo *info) -{ - static struct task_struct *kdb_prev_t; - int sig, new_t; - if (!spin_trylock(&t->sighand->siglock)) { - kdb_printf("Can't do kill command now.\n" - "The sigmask lock is held somewhere else in " - "kernel, try again later\n"); - return; - } - spin_unlock(&t->sighand->siglock); - new_t = kdb_prev_t != t; - kdb_prev_t = t; - if (t->state != TASK_RUNNING && new_t) { - kdb_printf("Process is not RUNNING, sending a signal from " - "kdb risks deadlock\n" - "on the run queue locks. 
" - "The signal has _not_ been sent.\n" - "Reissue the kill command if you want to risk " - "the deadlock.\n"); - return; - } - sig = info->si_signo; - if (send_sig_info(sig, info, t)) - kdb_printf("Fail to deliver Signal %d to process %d.\n", - sig, t->pid); - else - kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); -} -#endif /* CONFIG_KGDB_KDB */ -/* - * Generic helpers for smp ipi calls - * - * (C) Jens Axboe 2008 - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static struct { - struct list_head queue; - raw_spinlock_t lock; -} call_function __cacheline_aligned_in_smp = - { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), - }; - -enum { - CSD_FLAG_LOCK = 0x01, -}; - -struct call_function_data { - struct call_single_data csd; - atomic_t refs; - cpumask_var_t cpumask; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); - -struct call_single_queue { - struct list_head list; - raw_spinlock_t lock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); - -static int -hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, - cpu_to_node(cpu))) - return notifier_from_errno(-ENOMEM); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - - case CPU_DEAD: - case CPU_DEAD_FROZEN: - free_cpumask_var(cfd->cpumask); - break; -#endif - }; - - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { - .notifier_call = hotplug_cfd, -}; - -void __init call_function_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int i; - - 
for_each_possible_cpu(i) { - struct call_single_queue *q = &per_cpu(call_single_queue, i); - - raw_spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->list); - } - - hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); - register_cpu_notifier(&hotplug_cfd_notifier); -} - -/* - * csd_lock/csd_unlock used to serialize access to per-cpu csd resources - * - * For non-synchronous ipi calls the csd can still be in use by the - * previous function call. For multi-cpu calls its even more interesting - * as we'll have to ensure no other cpu is observing our csd. - */ -static void csd_lock_wait(struct call_single_data *data) -{ - while (data->flags & CSD_FLAG_LOCK) - cpu_relax(); -} - -static void csd_lock(struct call_single_data *data) -{ - csd_lock_wait(data); - data->flags = CSD_FLAG_LOCK; - - /* - * prevent CPU from reordering the above assignment - * to ->flags with any subsequent assignments to other - * fields of the specified call_single_data structure: - */ - smp_mb(); -} - -static void csd_unlock(struct call_single_data *data) -{ - WARN_ON(!(data->flags & CSD_FLAG_LOCK)); - - /* - * ensure we're all done before releasing data: - */ - smp_mb(); - - data->flags &= ~CSD_FLAG_LOCK; -} - -/* - * Insert a previously allocated call_single_data element - * for execution on the given CPU. data must already have - * ->func, ->info, and ->flags set. - */ -static -void generic_exec_single(int cpu, struct call_single_data *data, int wait) -{ - struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - unsigned long flags; - int ipi; - - raw_spin_lock_irqsave(&dst->lock, flags); - ipi = list_empty(&dst->list); - list_add_tail(&data->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); - - /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. 
- * - * If IPIs can go out of order to the cache coherency protocol - * in an architecture, sufficient synchronisation should be added - * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really - * equipped to do the right thing... - */ - if (ipi) - arch_send_call_function_single_ipi(cpu); - - if (wait) - csd_lock_wait(data); -} - -/* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. - */ -void generic_smp_call_function_interrupt(void) -{ - struct call_function_data *data; - int cpu = smp_processor_id(); - - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - WARN_ON_ONCE(!cpu_online(cpu)); - - /* - * Ensure entry is visible on call_function_queue after we have - * entered the IPI. See comment in smp_call_function_many. - * If we don't have this, then we may miss an entry on the list - * and never get another IPI to process it. - */ - smp_mb(); - - /* - * It's ok to use list_for_each_rcu() here even though we may - * delete 'pos', since list_del_rcu() doesn't clear ->next - */ - list_for_each_entry_rcu(data, &call_function.queue, csd.list) { - int refs; - smp_call_func_t func; - - /* - * Since we walk the list without any locks, we might - * see an entry that was completed, removed from the - * list and is in the process of being reused. - * - * We must check that the cpu is in the cpumask before - * checking the refs, and both must be set before - * executing the callback on this cpu. - */ - - if (!cpumask_test_cpu(cpu, data->cpumask)) - continue; - - smp_rmb(); - - if (atomic_read(&data->refs) == 0) - continue; - - func = data->csd.func; /* save for later warn */ - func(data->csd.info); - - /* - * If the cpu mask is not still set then func enabled - * interrupts (BUG), and this cpu took another smp call - * function interrupt and executed func(info) twice - * on this cpu. That nested execution decremented refs. 
- */ - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pf enabled interrupts and double executed\n", func); - continue; - } - - refs = atomic_dec_return(&data->refs); - WARN_ON(refs < 0); - - if (refs) - continue; - - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - - csd_unlock(&data->csd); - } - -} - -/* - * Invoked by arch to handle an IPI for call function single. Must be - * called from the arch with interrupts disabled. - */ -void generic_smp_call_function_single_interrupt(void) -{ - struct call_single_queue *q = &__get_cpu_var(call_single_queue); - unsigned int data_flags; - LIST_HEAD(list); - - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - WARN_ON_ONCE(!cpu_online(smp_processor_id())); - - raw_spin_lock(&q->lock); - list_replace_init(&q->list, &list); - raw_spin_unlock(&q->lock); - - while (!list_empty(&list)) { - struct call_single_data *data; - - data = list_entry(list.next, struct call_single_data, list); - list_del(&data->list); - - /* - * 'data' can be invalid after this call if flags == 0 - * (when called through generic_exec_single()), - * so save them away before making the call: - */ - data_flags = data->flags; - - data->func(data->info); - - /* - * Unlocked CSDs are valid through generic_exec_single(): - */ - if (data_flags & CSD_FLAG_LOCK) - csd_unlock(data); - } -} - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); - -/* - * smp_call_function_single - Run a function on a specific CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. 
- */ -int smp_call_function_single(int cpu, smp_call_func_t func, void *info, - int wait) -{ - struct call_single_data d = { - .flags = 0, - }; - unsigned long flags; - int this_cpu; - int err = 0; - - /* - * prevent preemption and reschedule on another processor, - * as well as CPU removal - */ - this_cpu = get_cpu(); - - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); - - if (cpu == this_cpu) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else { - if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = &d; - - if (!wait) - data = &__get_cpu_var(csd_data); - - csd_lock(data); - - data->func = func; - data->info = info; - generic_exec_single(cpu, data, wait); - } else { - err = -ENXIO; /* CPU not online */ - } - } - - put_cpu(); - - return err; -} -EXPORT_SYMBOL(smp_call_function_single); - -/* - * smp_call_function_any - Run a function on any of the given cpus - * @mask: The mask of cpus it can run on. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait until function has completed. - * - * Returns 0 on success, else a negative status code (if no cpus were online). - * Note that @wait will be implicitly turned on in case of allocation failures, - * since we fall back to on-stack allocation. 
- * - * Selection preference: - * 1) current cpu if in @mask - * 2) any cpu of current node if in @mask - * 3) any other online cpu in @mask - */ -int smp_call_function_any(const struct cpumask *mask, - smp_call_func_t func, void *info, int wait) -{ - unsigned int cpu; - const struct cpumask *nodemask; - int ret; - - /* Try for same CPU (cheapest) */ - cpu = get_cpu(); - if (cpumask_test_cpu(cpu, mask)) - goto call; - - /* Try for same node. */ - nodemask = cpumask_of_node(cpu_to_node(cpu)); - for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; - cpu = cpumask_next_and(cpu, nodemask, mask)) { - if (cpu_online(cpu)) - goto call; - } - - /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ - cpu = cpumask_any_and(mask, cpu_online_mask); -call: - ret = smp_call_function_single(cpu, func, info, wait); - put_cpu(); - return ret; -} -EXPORT_SYMBOL_GPL(smp_call_function_any); - -/** - * __smp_call_function_single(): Run a function on a specific CPU - * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. - * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. - */ -void __smp_call_function_single(int cpu, struct call_single_data *data, - int wait) -{ - unsigned int this_cpu; - unsigned long flags; - - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. 
- */ - WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() - && !oops_in_progress); - - if (cpu == this_cpu) { - local_irq_save(flags); - data->func(data->info); - local_irq_restore(flags); - } else { - csd_lock(data); - generic_exec_single(cpu, data, wait); - } - put_cpu(); -} - -/** - * smp_call_function_many(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on (only runs on online subset). - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. - * - * If @wait is true, then returns once @func has returned. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. Preemption - * must be disabled when calling this function. - */ -void smp_call_function_many(const struct cpumask *mask, - smp_call_func_t func, void *info, bool wait) -{ - struct call_function_data *data; - unsigned long flags; - int refs, cpu, next_cpu, this_cpu = smp_processor_id(); - - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress && !early_boot_irqs_disabled); - - /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ - cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == this_cpu) - cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - - /* No online cpus? We're done. */ - if (cpu >= nr_cpu_ids) - return; - - /* Do we have another CPU which isn't us? */ - next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == this_cpu) - next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); - - /* Fastpath: do that cpu by itself. 
*/ - if (next_cpu >= nr_cpu_ids) { - smp_call_function_single(cpu, func, info, wait); - return; - } - - data = &__get_cpu_var(cfd_data); - csd_lock(&data->csd); - - /* This BUG_ON verifies our reuse assertions and can be removed */ - BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); - - /* - * The global call function queue list add and delete are protected - * by a lock, but the list is traversed without any lock, relying - * on the rcu list add and delete to allow safe concurrent traversal. - * We reuse the call function data without waiting for any grace - * period after some other cpu removes it from the global queue. - * This means a cpu might find our data block as it is being - * filled out. - * - * We hold off the interrupt handler on the other cpu by - * ordering our writes to the cpu mask vs our setting of the - * refs counter. We assert only the cpu owning the data block - * will set a bit in cpumask, and each bit will only be cleared - * by the subject cpu. Each cpu must first find its bit is - * set and then check that refs is set indicating the element is - * ready to be processed, otherwise it must skip the entry. - * - * On the previous iteration refs was set to 0 by another cpu. - * To avoid the use of transitivity, set the counter to 0 here - * so the wmb will pair with the rmb in the interrupt handler. - */ - atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ - - data->csd.func = func; - data->csd.info = info; - - /* Ensure 0 refs is visible before mask. 
Also orders func and info */ - smp_wmb(); - - /* We rely on the "and" being processed before the store */ - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); - refs = cpumask_weight(data->cpumask); - - /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!refs)) { - csd_unlock(&data->csd); - return; - } - - raw_spin_lock_irqsave(&call_function.lock, flags); - /* - * Place entry at the _HEAD_ of the list, so that any cpu still - * observing the entry in generic_smp_call_function_interrupt() - * will not miss any other list entries: - */ - list_add_rcu(&data->csd.list, &call_function.queue); - /* - * We rely on the wmb() in list_add_rcu to complete our writes - * to the cpumask before this write to refs, which indicates - * data is on the list and is ready to be processed. - */ - atomic_set(&data->refs, refs); - raw_spin_unlock_irqrestore(&call_function.lock, flags); - - /* - * Make the list addition visible before sending the ipi. - * (IPIs must obey or appear to obey normal Linux cache - * coherency rules -- see comment in generic_exec_single). - */ - smp_mb(); - - /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(data->cpumask); - - /* Optionally wait for the CPUs to complete */ - if (wait) - csd_lock_wait(&data->csd); -} -EXPORT_SYMBOL(smp_call_function_many); - -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. - * - * Returns 0. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. 
- */ -int smp_call_function(smp_call_func_t func, void *info, int wait) -{ - preempt_disable(); - smp_call_function_many(cpu_online_mask, func, info, wait); - preempt_enable(); - - return 0; -} -EXPORT_SYMBOL(smp_call_function); - -void ipi_call_lock(void) -{ - raw_spin_lock(&call_function.lock); -} - -void ipi_call_unlock(void) -{ - raw_spin_unlock(&call_function.lock); -} - -void ipi_call_lock_irq(void) -{ - raw_spin_lock_irq(&call_function.lock); -} - -void ipi_call_unlock_irq(void) -{ - raw_spin_unlock_irq(&call_function.lock); -} -#endif /* USE_GENERIC_SMP_HELPERS */ - -/* Setup configured maximum number of CPUs to activate */ -unsigned int setup_max_cpus = NR_CPUS; -EXPORT_SYMBOL(setup_max_cpus); - - -/* - * Setup routine for controlling SMP activation - * - * Command-line option of "nosmp" or "maxcpus=0" will disable SMP - * activation entirely (the MPS table probe still happens, though). - * - * Command-line option of "maxcpus=", where is an integer - * greater than 0, limits the maximum number of CPUs activated in - * SMP mode to . 
- */ - -void __weak arch_disable_smp_support(void) { } - -static int __init nosmp(char *str) -{ - setup_max_cpus = 0; - arch_disable_smp_support(); - - return 0; -} - -early_param("nosmp", nosmp); - -/* this is hard limit */ -static int __init nrcpus(char *str) -{ - int nr_cpus; - - get_option(&str, &nr_cpus); - if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) - nr_cpu_ids = nr_cpus; - - return 0; -} - -early_param("nr_cpus", nrcpus); - -static int __init maxcpus(char *str) -{ - get_option(&str, &setup_max_cpus); - if (setup_max_cpus == 0) - arch_disable_smp_support(); - - return 0; -} - -early_param("maxcpus", maxcpus); - -/* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - -/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ -void __init setup_nr_cpu_ids(void) -{ - nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; -} - -/* Called by boot processor to activate the rest. */ -void __init smp_init(void) -{ - unsigned int cpu; - - /* FIXME: This should be done in userspace --RR */ - for_each_present_cpu(cpu) { - if (num_online_cpus() >= setup_max_cpus) - break; - if (!cpu_online(cpu)) - cpu_up(cpu); - } - - /* Any cleanup work */ - printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); - smp_cpus_done(setup_max_cpus); -} - -/* - * Call a function on all processors. May be used during early boot while - * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead - * of local_irq_disable/enable(). - */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) -{ - unsigned long flags; - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, wait); - local_irq_save(flags); - func(info); - local_irq_restore(flags); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(on_each_cpu); -/* - * linux/kernel/softirq.c - * - * Copyright (C) 1992 Linus Torvalds - * - * Distribute under GPLv2. - * - * Rewritten. 
Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - * Remote softirq infrastructure is by Jens Axboe. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#include -/* - - No shared variables, all the data are CPU local. - - If a softirq needs serialization, let it serialize itself - by its own spinlocks. - - Even if softirq is serialized, only local cpu is marked for - execution. Hence, we get something sort of weak cpu binding. - Though it is still not clear, will it result in better locality - or will not. - - Examples: - - NET RX softirq. It is multithreaded and does not require - any global serialization. - - NET TX softirq. It kicks software netdevice queues, hence - it is logically serialized per device, but this serialization - is invisible to common code. - - Tasklets: serialized wrt itself. - */ - -#ifndef __ARCH_IRQ_STAT -irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; -EXPORT_SYMBOL(irq_stat); -#endif - -static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; - -DEFINE_PER_CPU(struct task_struct *, ksoftirqd); - -char *softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER", "RCU" -}; - -/* - * we cannot loop indefinitely here to avoid userspace starvation, - * but we also don't want to introduce a worst case 1/HZ latency - * to the pending events, so lets the scheduler to balance - * the softirq load for us. - */ -static void wakeup_softirqd(void) -{ - /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (tsk && tsk->state != TASK_RUNNING) - wake_up_process(tsk); -} - -/* - * preempt_count and SOFTIRQ_OFFSET usage: - * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving - * softirq processing. 
- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) - * on local_bh_disable or local_bh_enable. - * This lets us distinguish between whether we are currently processing - * softirq and whether we just have bh disabled. - */ - -/* - * This one is for softirq.c-internal use, - * where hardirqs are disabled legitimately: - */ -#ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip, unsigned int cnt) -{ - unsigned long flags; - - WARN_ON_ONCE(in_irq()); - - raw_local_irq_save(flags); - /* - * The preempt tracer hooks into add_preempt_count and will break - * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET - * is set and before current->softirq_enabled is cleared. - * We must manually increment preempt_count here and manually - * call the trace_preempt_off later. - */ - preempt_count() += cnt; - /* - * Were softirqs turned off above: - */ - if (softirq_count() == cnt) - trace_softirqs_off(ip); - raw_local_irq_restore(flags); - - if (preempt_count() == cnt) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -} -#else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) -{ - add_preempt_count(cnt); - barrier(); -} -#endif /* CONFIG_TRACE_IRQFLAGS */ - -void local_bh_disable(void) -{ - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_DISABLE_OFFSET); -} - -EXPORT_SYMBOL(local_bh_disable); - -static void __local_bh_enable(unsigned int cnt) -{ - WARN_ON_ONCE(in_irq()); - WARN_ON_ONCE(!irqs_disabled()); - - if (softirq_count() == cnt) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); - sub_preempt_count(cnt); -} - -/* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), - * without processing still-pending softirqs: - */ -void _local_bh_enable(void) -{ - __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); -} - -EXPORT_SYMBOL(_local_bh_enable); - -static inline void 
_local_bh_enable_ip(unsigned long ip) -{ - WARN_ON_ONCE(in_irq() || irqs_disabled()); -#ifdef CONFIG_TRACE_IRQFLAGS - local_irq_disable(); -#endif - /* - * Are softirqs going to be turned on now: - */ - if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) - trace_softirqs_on(ip); - /* - * Keep preemption disabled until we are done with - * softirq processing: - */ - sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); - - if (unlikely(!in_interrupt() && local_softirq_pending())) - do_softirq(); - - dec_preempt_count(); -#ifdef CONFIG_TRACE_IRQFLAGS - local_irq_enable(); -#endif - preempt_check_resched(); -} - -void local_bh_enable(void) -{ - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - _local_bh_enable_ip(ip); -} -EXPORT_SYMBOL(local_bh_enable_ip); - -/* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. - * - * This number has been established via experimentation. - * The two things to balance is latency against fairness - - * we want to handle softirqs as soon as possible, but they - * should not be able to lock up the box. 
- */ -#define MAX_SOFTIRQ_RESTART 10 - -asmlinkage void __do_softirq(void) -{ - struct softirq_action *h; - __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; - int cpu; - - pending = local_softirq_pending(); - account_system_vtime(current); - - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); - lockdep_softirq_enter(); - - cpu = smp_processor_id(); -restart: - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); - - local_irq_enable(); - - h = softirq_vec; - - do { - if (pending & 1) { - unsigned int vec_nr = h - softirq_vec; - int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %u %s %p" - "with preempt_count %08x," - " exited with %08x?\n", vec_nr, - softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count() = prev_count; - } - - rcu_bh_qs(cpu); - } - h++; - pending >>= 1; - } while (pending); - - local_irq_disable(); - - pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; - - if (pending) - wakeup_softirqd(); - - lockdep_softirq_exit(); - - account_system_vtime(current); - __local_bh_enable(SOFTIRQ_OFFSET); -} - -#ifndef __ARCH_HAS_DO_SOFTIRQ - -asmlinkage void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - - pending = local_softirq_pending(); - - if (pending) - __do_softirq(); - - local_irq_restore(flags); -} - -#endif - -/* - * Enter an interrupt context. - */ -void irq_enter(void) -{ - int cpu = smp_processor_id(); - - rcu_irq_enter(); - if (idle_cpu(cpu) && !in_interrupt()) { - /* - * Prevent raise_softirq from needlessly waking up ksoftirqd - * here, as softirq will be serviced on return from interrupt. 
- */ - local_bh_disable(); - tick_check_idle(cpu); - _local_bh_enable(); - } - - __irq_enter(); -} - -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -static inline void invoke_softirq(void) -{ - if (!force_irqthreads) - __do_softirq(); - else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); - wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } -} -#else -static inline void invoke_softirq(void) -{ - if (!force_irqthreads) - do_softirq(); - else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); - wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } -} -#endif - -/* - * Exit an interrupt context. Process softirqs if needed and possible: - */ -void irq_exit(void) -{ - account_system_vtime(current); - trace_hardirq_exit(); - sub_preempt_count(IRQ_EXIT_OFFSET); - if (!in_interrupt() && local_softirq_pending()) - invoke_softirq(); - -#ifdef CONFIG_NO_HZ - /* Make sure that timer wheel updates are propagated */ - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_irq_exit(); -#endif - rcu_irq_exit(); - preempt_enable_no_resched(); -} - -/* - * This function must run with irqs disabled! - */ -inline void raise_softirq_irqoff(unsigned int nr) -{ - __raise_softirq_irqoff(nr); - - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. 
- */ - if (!in_interrupt()) - wakeup_softirqd(); -} - -void raise_softirq(unsigned int nr) -{ - unsigned long flags; - - local_irq_save(flags); - raise_softirq_irqoff(nr); - local_irq_restore(flags); -} - -void open_softirq(int nr, void (*action)(struct softirq_action *)) -{ - softirq_vec[nr].action = action; -} - -/* - * Tasklets - */ -struct tasklet_head -{ - struct tasklet_struct *head; - struct tasklet_struct **tail; -}; - -static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); -static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); - -void __tasklet_schedule(struct tasklet_struct *t) -{ - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_restore(flags); -} - -EXPORT_SYMBOL(__tasklet_schedule); - -void __tasklet_hi_schedule(struct tasklet_struct *t) -{ - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_restore(flags); -} - -EXPORT_SYMBOL(__tasklet_hi_schedule); - -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - BUG_ON(!irqs_disabled()); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} - -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - -static void tasklet_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - - local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - 
t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); - } -} - -static void tasklet_hi_action(struct softirq_action *a) -{ - struct tasklet_struct *list; - - local_irq_disable(); - list = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } -} - - -void tasklet_init(struct tasklet_struct *t, - void (*func)(unsigned long), unsigned long data) -{ - t->next = NULL; - t->state = 0; - atomic_set(&t->count, 0); - t->func = func; - t->data = data; -} - -EXPORT_SYMBOL(tasklet_init); - -void tasklet_kill(struct tasklet_struct *t) -{ - if (in_interrupt()) - printk("Attempt to kill tasklet from interrupt\n"); - - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - do { - yield(); - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); - } - tasklet_unlock_wait(t); - clear_bit(TASKLET_STATE_SCHED, &t->state); -} - -EXPORT_SYMBOL(tasklet_kill); - -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. 
- */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ - struct tasklet_hrtimer *ttimer = - container_of(timer, struct tasklet_hrtimer, timer); - - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ - struct tasklet_hrtimer *ttimer = (void *)data; - enum hrtimer_restart restart; - - restart = ttimer->function(&ttimer->timer); - if (restart != HRTIMER_NORESTART) - hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode) -{ - hrtimer_init(&ttimer->timer, which_clock, mode); - ttimer->timer.function = __hrtimer_tasklet_trampoline; - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, - (unsigned long)ttimer); - ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); - -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ - struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - - list_add_tail(&cp->list, head); - - /* Trigger the softirq only if the list was previously empty. 
*/ - if (head->next == &cp->list) - raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ - struct call_single_data *cp = data; - unsigned long flags; - int softirq; - - softirq = cp->priv; - - local_irq_save(flags); - __local_trigger(cp, softirq); - local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - if (cpu_online(cpu)) { - cp->func = remote_softirq_receive; - cp->info = cp; - cp->flags = 0; - cp->priv = softirq; - - __smp_call_function_single(cpu, cp, 0); - return 0; - } - return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu. If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ - if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) - __local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. 
- */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - unsigned long flags; - int this_cpu; - - local_irq_save(flags); - this_cpu = smp_processor_id(); - __send_remote_softirq(cp, cpu, this_cpu, softirq); - local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - int i; - - local_irq_disable(); - for (i = 0; i < NR_SOFTIRQS; i++) { - struct list_head *head = &per_cpu(softirq_work_list[i], cpu); - struct list_head *local_head; - - if (list_empty(head)) - continue; - - local_head = &__get_cpu_var(softirq_work_list[i]); - list_splice_init(head, local_head); - raise_softirq_irqoff(i); - } - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { - .notifier_call = remote_softirq_cpu_notify, -}; - -void __init softirq_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - int i; - - per_cpu(tasklet_vec, cpu).tail = - &per_cpu(tasklet_vec, cpu).head; - per_cpu(tasklet_hi_vec, cpu).tail = - &per_cpu(tasklet_hi_vec, cpu).head; - for (i = 0; i < NR_SOFTIRQS; i++) - INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); - } - - register_hotcpu_notifier(&remote_softirq_cpu_notifier); - - open_softirq(TASKLET_SOFTIRQ, tasklet_action); - open_softirq(HI_SOFTIRQ, tasklet_hi_action); -} - -static int run_ksoftirqd(void * __bind_cpu) -{ - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - preempt_disable(); - if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); - } - - __set_current_state(TASK_RUNNING); - - while (local_softirq_pending()) { - /* Preempt disable stops cpu going 
offline. - If already offline, we'll be on wrong CPU: - don't process */ - if (cpu_is_offline((long)__bind_cpu)) - goto wait_to_die; - local_irq_disable(); - if (local_softirq_pending()) - __do_softirq(); - local_irq_enable(); - preempt_enable_no_resched(); - cond_resched(); - preempt_disable(); - rcu_note_context_switch((long)__bind_cpu); - } - preempt_enable(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; - -wait_to_die: - preempt_enable(); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -/* - * tasklet_kill_immediate is called to remove a tasklet which can already be - * scheduled for execution on @cpu. - * - * Unlike tasklet_kill, this function removes the tasklet - * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. - * - * When this function is called, @cpu must be in the CPU_DEAD state. - */ -void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) -{ - struct tasklet_struct **i; - - BUG_ON(cpu_online(cpu)); - BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); - - if (!test_bit(TASKLET_STATE_SCHED, &t->state)) - return; - - /* CPU is dead, so no lock needed. */ - for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { - if (*i == t) { - *i = t->next; - /* If this was the tail element, move the tail ptr */ - if (*i == NULL) - per_cpu(tasklet_vec, cpu).tail = i; - return; - } - } - BUG(); -} - -static void takeover_tasklets(unsigned int cpu) -{ - /* CPU is dead, so no lock needed. */ - local_irq_disable(); - - /* Find end, append list for that CPU. 
*/ - if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { - *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; - this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); - per_cpu(tasklet_vec, cpu).head = NULL; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; - } - raise_softirq_irqoff(TASKLET_SOFTIRQ); - - if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { - *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; - __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); - per_cpu(tasklet_hi_vec, cpu).head = NULL; - per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - } - raise_softirq_irqoff(HI_SOFTIRQ); - - local_irq_enable(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - p = kthread_create_on_node(run_ksoftirqd, - hcpu, - cpu_to_node(hotcpu), - "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return notifier_from_errno(PTR_ERR(p)); - } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - cpumask_any(cpu_online_mask)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: { - static const struct sched_param param = { - .sched_priority = MAX_RT_PRIO-1 - }; - - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); - takeover_tasklets(hotcpu); - break; - } -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static __init int spawn_ksoftirqd(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - - BUG_ON(err != NOTIFY_OK); - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - return 0; -} -early_initcall(spawn_ksoftirqd); - -/* - * [ These __weak aliases are kept in a separate compilation unit, so that - * GCC does not inline them incorrectly. ] - */ - -int __init __weak early_irq_init(void) -{ - return 0; -} - -#ifdef CONFIG_GENERIC_HARDIRQS -int __init __weak arch_probe_nr_irqs(void) -{ - return NR_IRQS_LEGACY; -} - -int __init __weak arch_early_irq_init(void) -{ - return 0; -} -#endif -/* - * Copyright (2004) Linus Torvalds - * - * Author: Zwane Mwaikambo - * - * Copyright (2004, 2005) Ingo Molnar - * - * This file contains the spinlock/rwlock implementations for the - * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) - * - * Note that some architectures have special knowledge about the - * stack frames of these functions in their profile_pc. If you - * change anything significant here that could change the stack - * frame contact the architecture maintainers. 
- */ - -#include -#include -#include -#include -#include -#include - -/* - * If lockdep is enabled then we use the non-preemption spin-ops - * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are - * not re-enabled during lock-acquire (which the preempt-spin-ops do): - */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -/* - * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h - */ -#else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) -/* - * We build the __lock_function inlines here. They are too large for - * inlining all over the place, but here is only one user per function - * which embedds them into the calling _lock_function below. - * - * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal - * towards that other CPU that it should break the lock ASAP. - */ -#define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ -{ \ - for (;;) { \ - preempt_disable(); \ - if (likely(do_raw_##op##_trylock(lock))) \ - break; \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ -} \ - \ -unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - for (;;) { \ - preempt_disable(); \ - local_irq_save(flags); \ - if (likely(do_raw_##op##_trylock(lock))) \ - break; \ - local_irq_restore(flags); \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ - return flags; \ -} \ - \ -void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ -{ \ - 
_raw_##op##_lock_irqsave(lock); \ -} \ - \ -void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - /* */ \ - /* Careful: we must exclude softirqs too, hence the */ \ - /* irq-disabling. We use the generic preemption-aware */ \ - /* function: */ \ - /**/ \ - flags = _raw_##op##_lock_irqsave(lock); \ - local_bh_disable(); \ - local_irq_restore(flags); \ -} \ - -/* - * Build preemption-friendly versions of the following - * lock-spinning functions: - * - * __[spin|read|write]_lock() - * __[spin|read|write]_lock_irq() - * __[spin|read|write]_lock_irqsave() - * __[spin|read|write]_lock_bh() - */ -BUILD_LOCK_OPS(spin, raw_spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); - -#endif - -#ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) -{ - return __raw_spin_trylock(lock); -} -EXPORT_SYMBOL(_raw_spin_trylock); -#endif - -#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) -{ - return __raw_spin_trylock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_trylock_bh); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) -{ - __raw_spin_lock(lock); -} -EXPORT_SYMBOL(_raw_spin_lock); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) -{ - return __raw_spin_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) -{ - __raw_spin_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_irq); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) -{ - __raw_spin_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) -{ - __raw_spin_unlock(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock); -#endif 
- -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) -{ - __raw_spin_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) -{ - __raw_spin_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) -{ - __raw_spin_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock_bh); -#endif - -#ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) -{ - return __raw_read_trylock(lock); -} -EXPORT_SYMBOL(_raw_read_trylock); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) -{ - __raw_read_lock(lock); -} -EXPORT_SYMBOL(_raw_read_lock); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) -{ - return __raw_read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_read_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) -{ - __raw_read_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_read_lock_irq); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) -{ - __raw_read_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_read_lock_bh); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) -{ - __raw_read_unlock(lock); -} -EXPORT_SYMBOL(_raw_read_unlock); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - __raw_read_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_read_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) -{ - __raw_read_unlock_irq(lock); -} 
-EXPORT_SYMBOL(_raw_read_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) -{ - __raw_read_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_read_unlock_bh); -#endif - -#ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) -{ - return __raw_write_trylock(lock); -} -EXPORT_SYMBOL(_raw_write_trylock); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) -{ - __raw_write_lock(lock); -} -EXPORT_SYMBOL(_raw_write_lock); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) -{ - return __raw_write_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_write_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) -{ - __raw_write_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_write_lock_irq); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) -{ - __raw_write_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_write_lock_bh); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) -{ - __raw_write_unlock(lock); -} -EXPORT_SYMBOL(_raw_write_unlock); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - __raw_write_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_write_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) -{ - __raw_write_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_write_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) -{ - __raw_write_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_write_unlock_bh); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) -{ - 
preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_nested); - -unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, - int subclass) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); - return flags; -} -EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); - -void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, - struct lockdep_map *nest_lock) -{ - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_nest_lock); - -#endif - -notrace int in_lock_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __lockfunc functions */ - extern char __lock_text_start[], __lock_text_end[]; - - return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; -} -EXPORT_SYMBOL(in_lock_functions); -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Paul McKenney - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - mutex_init(&sp->mutex); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * srcu_readers_active_idx -- returns approximate number of readers - * active on the specified rank of per-CPU counters. - */ - -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - int sum; - - sum = 0; - for_each_possible_cpu(cpu) - sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; - return sum; -} - -/** - * srcu_readers_active - returns approximate number of readers. - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. 
That said, it - * can be useful as an error check at cleanup time. - */ -static int srcu_readers_active(struct srcu_struct *sp) -{ - return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - int sum; - - sum = srcu_readers_active(sp); - WARN_ON(sum); /* Leakage unless caller handles error. */ - if (sum != 0) - return; - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - preempt_disable(); - idx = sp->completed & 0x1; - barrier(); /* ensure compiler looks -once- at sp->completed. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; - srcu_barrier(); /* ensure compiler won't misorder critical section. */ - preempt_enable(); - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - preempt_disable(); - srcu_barrier(); /* ensure compiler won't misorder critical section. 
*/ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; - preempt_enable(); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SYNCHRONIZE_SRCU_READER_DELAY 10 - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) -{ - int idx; - - idx = sp->completed; - mutex_lock(&sp->mutex); - - /* - * Check to see if someone else did the work for us while we were - * waiting to acquire the lock. We need -two- advances of - * the counter, not just one. If there was but one, we might have - * shown up -after- our helper's first synchronize_sched(), thus - * having failed to prevent CPU-reordering races with concurrent - * srcu_read_unlock()s on other CPUs (see comment below). So we - * either (1) wait for two or (2) supply the second ourselves. - */ - - if ((sp->completed - idx) >= 2) { - mutex_unlock(&sp->mutex); - return; - } - - sync_func(); /* Force memory barrier on all CPUs. */ - - /* - * The preceding synchronize_sched() ensures that any CPU that - * sees the new value of sp->completed will also see any preceding - * changes to data structures made by this CPU. This prevents - * some other CPU from reordering the accesses in its SRCU - * read-side critical section to precede the corresponding - * srcu_read_lock() -- ensuring that such references will in - * fact be protected. - * - * So it is now safe to do the flip. 
- */ - - idx = sp->completed & 0x1; - sp->completed++; - - sync_func(); /* Force memory barrier on all CPUs. */ - - /* - * At this point, because of the preceding synchronize_sched(), - * all srcu_read_lock() calls using the old counters have completed. - * Their corresponding critical sections might well be still - * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. We initially give readers - * an arbitrarily chosen 10 microseconds to get out of their - * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. The 10-microsecond value has done - * very well in testing. - */ - - if (srcu_readers_active_idx(sp, idx)) - udelay(SYNCHRONIZE_SRCU_READER_DELAY); - while (srcu_readers_active_idx(sp, idx)) - schedule_timeout_interruptible(1); - - sync_func(); /* Force memory barrier on all CPUs. */ - - /* - * The preceding synchronize_sched() forces all srcu_read_unlock() - * primitives that were executing concurrently with the preceding - * for_each_possible_cpu() loop to have completed by this point. - * More importantly, it also forces the corresponding SRCU read-side - * critical sections to have also completed, and the corresponding - * references to SRCU-protected data items to be dropped. - * - * Note: - * - * Despite what you might think at first glance, the - * preceding synchronize_sched() -must- be within the - * critical section ended by the following mutex_unlock(). - * Otherwise, a task taking the early exit can race - * with a srcu_read_unlock(), which might have executed - * just before the preceding srcu_readers_active() check, - * and whose CPU might have reordered the srcu_read_unlock() - * with the preceding critical section. In this case, there - * is nothing preventing the synchronize_sched() task that is - * taking the early exit from freeing a data structure that - * is still being referenced (out of order) by the task - * doing the srcu_read_unlock(). 
- * - * Alternatively, the comparison with "2" on the early exit - * could be changed to "3", but this increases synchronize_srcu() - * latency for bulk loads. So the current code is preferred. - */ - - mutex_unlock(&sp->mutex); -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, synchronize_sched); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - like synchronize_srcu, but less patient - * @sp: srcu_struct with which to synchronize. - * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. - * - * Note that it is illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; doing so - * will result in deadlock. However, it is perfectly legal to call - * synchronize_srcu_expedited() on one srcu_struct from some other - * srcu_struct's read-side critical section. - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, synchronize_sched_expedited); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_batches_completed - return batches completed. 
- * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. - */ - -long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); -/* - * kernel/stacktrace.c - * - * Stack trace management functions - * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar - */ -#include -#include -#include -#include -#include - -void print_stack_trace(struct stack_trace *trace, int spaces) -{ - int i; - - if (WARN_ON(!trace->entries)) - return; - - for (i = 0; i < trace->nr_entries; i++) { - printk("%*c", 1 + spaces, ' '); - print_ip_sym(trace->entries[i]); - } -} -EXPORT_SYMBOL_GPL(print_stack_trace); - -/* - * Architectures that do not implement save_stack_trace_tsk or - * save_stack_trace_regs get this weak alias and a once-per-bootup warning - * (whenever this facility is utilized - for example by procfs): - */ -__weak void -save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); -} - -__weak void -save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); -} -/* - * kernel/stop_machine.c - * - * Copyright (C) 2008, 2005 IBM Corporation. - * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au - * Copyright (C) 2010 SUSE Linux Products GmbH - * Copyright (C) 2010 Tejun Heo - * - * This file is released under the GPLv2 and any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Structure to determine completion condition and record errors. May - * be shared by works on different cpus. - */ -struct cpu_stop_done { - atomic_t nr_todo; /* nr left to execute */ - bool executed; /* actually executed? 
*/ - int ret; /* collected return value */ - struct completion completion; /* fired if nr_todo reaches 0 */ -}; - -/* the actual stopper, one per every possible cpu, enabled on online cpus */ -struct cpu_stopper { - spinlock_t lock; - bool enabled; /* is this stopper enabled? */ - struct list_head works; /* list of pending works */ - struct task_struct *thread; /* stopper thread */ -}; - -static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); -static bool stop_machine_initialized = false; - -static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) -{ - memset(done, 0, sizeof(*done)); - atomic_set(&done->nr_todo, nr_todo); - init_completion(&done->completion); -} - -/* signal completion unless @done is NULL */ -static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) -{ - if (done) { - if (executed) - done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); - } -} - -/* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work) -{ - unsigned long flags; - - spin_lock_irqsave(&stopper->lock, flags); - - if (stopper->enabled) { - list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); - } else - cpu_stop_signal_done(work->done, false); - - spin_unlock_irqrestore(&stopper->lock, flags); -} - -/** - * stop_one_cpu - stop a cpu - * @cpu: cpu to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Execute @fn(@arg) on @cpu. @fn is run in a process context with - * the highest priority preempting any task on the cpu and - * monopolizing it. This function returns after the execution is - * complete. - * - * This function doesn't guarantee @cpu stays online till @fn - * completes. If @cpu goes down in the middle, execution may happen - * partially or fully on different cpus. 
@fn should either be ready - * for that or the caller should ensure that @cpu stays online until - * this function completes. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -ENOENT if @fn(@arg) was not executed because @cpu was offline; - * otherwise, the return value of @fn. - */ -int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) -{ - struct cpu_stop_done done; - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; - - cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); - wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; -} - -/** - * stop_one_cpu_nowait - stop a cpu but don't wait for completion - * @cpu: cpu to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Similar to stop_one_cpu() but doesn't wait for completion. The - * caller is responsible for ensuring @work_buf is currently unused - * and will remain untouched until stopper starts executing @fn. - * - * CONTEXT: - * Don't care. - */ -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, - struct cpu_stop_work *work_buf) -{ - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); -} - -/* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); -static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); - -static void queue_stop_cpus_work(const struct cpumask *cpumask, - cpu_stop_fn_t fn, void *arg, - struct cpu_stop_done *done) -{ - struct cpu_stop_work *work; - unsigned int cpu; - - /* initialize works and done */ - for_each_cpu(cpu, cpumask) { - work = &per_cpu(stop_cpus_work, cpu); - work->fn = fn; - work->arg = arg; - work->done = done; - } - - /* - * Disable preemption while queueing to avoid getting - * preempted by a stopper which might wait for other stoppers - * to enter @fn which can lead to deadlock. 
- */ - preempt_disable(); - for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), - &per_cpu(stop_cpus_work, cpu)); - preempt_enable(); -} - -static int __stop_cpus(const struct cpumask *cpumask, - cpu_stop_fn_t fn, void *arg) -{ - struct cpu_stop_done done; - - cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); - wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; -} - -/** - * stop_cpus - stop multiple cpus - * @cpumask: cpus to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, - * @fn is run in a process context with the highest priority - * preempting any task on the cpu and monopolizing it. This function - * returns after all executions are complete. - * - * This function doesn't guarantee the cpus in @cpumask stay online - * till @fn completes. If some cpus go down in the middle, execution - * on the cpu may happen partially or fully on different cpus. @fn - * should either be ready for that or the caller should ensure that - * the cpus stay online until this function completes. - * - * All stop_cpus() calls are serialized making it safe for @fn to wait - * for all cpus to start executing it. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -ENOENT if @fn(@arg) was not executed at all because all cpus in - * @cpumask were offline; otherwise, 0 if all executions of @fn - * returned 0, any non zero return value if any returned non zero. 
- */ -int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -{ - int ret; - - /* static works are used, process one request at a time */ - mutex_lock(&stop_cpus_mutex); - ret = __stop_cpus(cpumask, fn, arg); - mutex_unlock(&stop_cpus_mutex); - return ret; -} - -/** - * try_stop_cpus - try to stop multiple cpus - * @cpumask: cpus to stop - * @fn: function to execute - * @arg: argument to @fn - * - * Identical to stop_cpus() except that it fails with -EAGAIN if - * someone else is already using the facility. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * -EAGAIN if someone else is already stopping cpus, -ENOENT if - * @fn(@arg) was not executed at all because all cpus in @cpumask were - * offline; otherwise, 0 if all executions of @fn returned 0, any non - * zero return value if any returned non zero. - */ -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) -{ - int ret; - - /* static works are used, process one request at a time */ - if (!mutex_trylock(&stop_cpus_mutex)) - return -EAGAIN; - ret = __stop_cpus(cpumask, fn, arg); - mutex_unlock(&stop_cpus_mutex); - return ret; -} - -static int cpu_stopper_thread(void *data) -{ - struct cpu_stopper *stopper = data; - struct cpu_stop_work *work; - int ret; - -repeat: - set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - - work = NULL; - spin_lock_irq(&stopper->lock); - if (!list_empty(&stopper->works)) { - work = list_first_entry(&stopper->works, - struct cpu_stop_work, list); - list_del_init(&work->list); - } - spin_unlock_irq(&stopper->lock); - - if (work) { - cpu_stop_fn_t fn = work->fn; - void *arg = work->arg; - struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - - __set_current_state(TASK_RUNNING); - - /* cpu stop callbacks are not allowed to sleep */ - preempt_disable(); - - ret = fn(arg); - if (ret) - done->ret = 
ret; - - /* restore preemption and check it's still balanced */ - preempt_enable(); - WARN_ONCE(preempt_count(), - "cpu_stop: %s(%p) leaked preempt count\n", - kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, - ksym_buf), arg); - - cpu_stop_signal_done(done, true); - } else - schedule(); - - goto repeat; -} - -extern void sched_set_stop_task(int cpu, struct task_struct *stop); - -/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ -static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - BUG_ON(stopper->thread || stopper->enabled || - !list_empty(&stopper->works)); - p = kthread_create_on_node(cpu_stopper_thread, - stopper, - cpu_to_node(cpu), - "migration/%d", cpu); - if (IS_ERR(p)) - return notifier_from_errno(PTR_ERR(p)); - get_task_struct(p); - kthread_bind(p, cpu); - sched_set_stop_task(cpu, p); - stopper->thread = p; - break; - - case CPU_ONLINE: - /* strictly unnecessary, as first user will wake it */ - wake_up_process(stopper->thread); - /* mark enabled */ - spin_lock_irq(&stopper->lock); - stopper->enabled = true; - spin_unlock_irq(&stopper->lock); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_POST_DEAD: - { - struct cpu_stop_work *work; - - sched_set_stop_task(cpu, NULL); - /* kill the stopper */ - kthread_stop(stopper->thread); - /* drain remaining works */ - spin_lock_irq(&stopper->lock); - list_for_each_entry(work, &stopper->works, list) - cpu_stop_signal_done(work->done, false); - stopper->enabled = false; - spin_unlock_irq(&stopper->lock); - /* release the stopper */ - put_task_struct(stopper->thread); - stopper->thread = NULL; - break; - } -#endif - } - - return NOTIFY_OK; -} - -/* - * Give it a higher priority so that cpu stopper is available to 
other - * cpu notifiers. It currently shares the same priority as sched - * migration_notifier. - */ -static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { - .notifier_call = cpu_stop_cpu_callback, - .priority = 10, -}; - -static int __init cpu_stop_init(void) -{ - void *bcpu = (void *)(long)smp_processor_id(); - unsigned int cpu; - int err; - - for_each_possible_cpu(cpu) { - struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - - spin_lock_init(&stopper->lock); - INIT_LIST_HEAD(&stopper->works); - } - - /* start one for the boot cpu */ - err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, - bcpu); - BUG_ON(err != NOTIFY_OK); - cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); - register_cpu_notifier(&cpu_stop_cpu_notifier); - - stop_machine_initialized = true; - - return 0; -} -early_initcall(cpu_stop_init); - -#ifdef CONFIG_STOP_MACHINE - -/* This controls the threads on each CPU. */ -enum stopmachine_state { - /* Dummy starting state for thread. */ - STOPMACHINE_NONE, - /* Awaiting everyone to be scheduled. */ - STOPMACHINE_PREPARE, - /* Disable interrupts. */ - STOPMACHINE_DISABLE_IRQ, - /* Run the function */ - STOPMACHINE_RUN, - /* Exit */ - STOPMACHINE_EXIT, -}; - -struct stop_machine_data { - int (*fn)(void *); - void *data; - /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ - unsigned int num_threads; - const struct cpumask *active_cpus; - - enum stopmachine_state state; - atomic_t thread_ack; -}; - -static void set_state(struct stop_machine_data *smdata, - enum stopmachine_state newstate) -{ - /* Reset ack counter. */ - atomic_set(&smdata->thread_ack, smdata->num_threads); - smp_wmb(); - smdata->state = newstate; -} - -/* Last one to ack a state moves to the next state. */ -static void ack_state(struct stop_machine_data *smdata) -{ - if (atomic_dec_and_test(&smdata->thread_ack)) - set_state(smdata, smdata->state + 1); -} - -/* This is the cpu_stop function which stops the CPU. 
*/ -static int stop_machine_cpu_stop(void *data) -{ - struct stop_machine_data *smdata = data; - enum stopmachine_state curstate = STOPMACHINE_NONE; - int cpu = smp_processor_id(), err = 0; - unsigned long flags; - bool is_active; - - /* - * When called from stop_machine_from_inactive_cpu(), irq might - * already be disabled. Save the state and restore it on exit. - */ - local_save_flags(flags); - - if (!smdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - - /* Simple state machine */ - do { - /* Chill out and ensure we re-read stopmachine_state. */ - cpu_relax(); - if (smdata->state != curstate) { - curstate = smdata->state; - switch (curstate) { - case STOPMACHINE_DISABLE_IRQ: - local_irq_disable(); - hard_irq_disable(); - break; - case STOPMACHINE_RUN: - if (is_active) - err = smdata->fn(smdata->data); - break; - default: - break; - } - ack_state(smdata); - } - } while (curstate != STOPMACHINE_EXIT); - - local_irq_restore(flags); - return err; -} - -int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) -{ - struct stop_machine_data smdata = { .fn = fn, .data = data, - .num_threads = num_online_cpus(), - .active_cpus = cpus }; - - if (!stop_machine_initialized) { - /* - * Handle the case where stop_machine() is called - * early in boot before stop_machine() has been - * initialized. - */ - unsigned long flags; - int ret; - - WARN_ON_ONCE(smdata.num_threads != 1); - - local_irq_save(flags); - hard_irq_disable(); - ret = (*fn)(data); - local_irq_restore(flags); - - return ret; - } - - /* Set the initial state and stop all online cpus. */ - set_state(&smdata, STOPMACHINE_PREPARE); - return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); -} - -int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) -{ - int ret; - - /* No CPUs can come up or down during this. 
*/ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); - return ret; -} -EXPORT_SYMBOL_GPL(stop_machine); - -/** - * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU - * @fn: the function to run - * @data: the data ptr for the @fn() - * @cpus: the cpus to run the @fn() on (NULL = any online cpu) - * - * This is identical to stop_machine() but can be called from a CPU which - * is not active. The local CPU is in the process of hotplug (so no other - * CPU hotplug can start) and not marked active and doesn't have enough - * context to sleep. - * - * This function provides stop_machine() functionality for such state by - * using busy-wait for synchronization and executing @fn directly for local - * CPU. - * - * CONTEXT: - * Local CPU is inactive. Temporarily stops all active CPUs. - * - * RETURNS: - * 0 if all executions of @fn returned 0, any non zero return value if any - * returned non zero. - */ -int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, - const struct cpumask *cpus) -{ - struct stop_machine_data smdata = { .fn = fn, .data = data, - .active_cpus = cpus }; - struct cpu_stop_done done; - int ret; - - /* Local CPU must be inactive and CPU hotplug in progress. */ - BUG_ON(cpu_active(raw_smp_processor_id())); - smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ - - /* No proper task established and can't sleep - busy wait for lock. */ - while (!mutex_trylock(&stop_cpus_mutex)) - cpu_relax(); - - /* Schedule work on other CPUs and execute directly for local CPU */ - set_state(&smdata, STOPMACHINE_PREPARE); - cpu_stop_init_done(&done, num_active_cpus()); - queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, - &done); - ret = stop_machine_cpu_stop(&smdata); - - /* Busy wait for completion. 
*/ - while (!completion_done(&done.completion)) - cpu_relax(); - - mutex_unlock(&stop_cpus_mutex); - return ret ?: done.ret; -} - -#endif /* CONFIG_STOP_MACHINE */ -/* - * linux/kernel/sys.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -/* Move somewhere else to avoid recompiling? */ -#include - -#include -#include -#include - -#ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) -#endif -#ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) -#endif -#ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) -#endif -#ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) -#endif -#ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) -#endif -#ifndef GET_TSC_CTL -# define GET_TSC_CTL(a) (-EINVAL) -#endif -#ifndef SET_TSC_CTL -# define SET_TSC_CTL(a) (-EINVAL) -#endif - -/* - * this is where the system-wide overflow UID and GID are defined, for - * architectures that now have 32-bit UID/GID but didn't in the past - */ - -int overflowuid = DEFAULT_OVERFLOWUID; -int overflowgid = DEFAULT_OVERFLOWGID; - -#ifdef CONFIG_UID16 -EXPORT_SYMBOL(overflowuid); -EXPORT_SYMBOL(overflowgid); -#endif - -/* - * the same as above, but for filesystems which can only store a 16-bit - * UID and GID. 
as such, this is needed on all architectures - */ - -int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; -int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; - -EXPORT_SYMBOL(fs_overflowuid); -EXPORT_SYMBOL(fs_overflowgid); - -/* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -/* - * Returns true if current's euid is same as p's uid or euid, - * or has CAP_SYS_NICE to p's user_ns. - * - * Called with rcu_read_lock, creds are safe - */ -static bool set_one_prio_perm(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred = __task_cred(p); - - if (pcred->user->user_ns == cred->user->user_ns && - (pcred->uid == cred->euid || - pcred->euid == cred->euid)) - return true; - if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) - return true; - return false; -} - -/* - * set the priority of a task - * - the caller must hold the RCU read lock - */ -static int set_one_prio(struct task_struct *p, int niceval, int error) -{ - int no_nice; - - if (!set_one_prio_perm(p)) { - error = -EPERM; - goto out; - } - if (niceval < task_nice(p) && !can_nice(p, niceval)) { - error = -EACCES; - goto out; - } - no_nice = security_task_setnice(p, niceval); - if (no_nice) { - error = no_nice; - goto out; - } - if (error == -ESRCH) - error = 0; - set_user_nice(p, niceval); -out: - return error; -} - -SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) -{ - struct task_struct *g, *p; - struct user_struct *user; - const struct cred *cred = current_cred(); - int error = -EINVAL; - struct pid *pgrp; - - if (which > PRIO_USER || which < PRIO_PROCESS) - goto out; - - /* normalize: avoid signed division (rounding problems) */ - error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; - - rcu_read_lock(); - read_lock(&tasklist_lock); - 
switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = (struct user_struct *) cred->user; - if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (__task_cred(p)->uid == who) - error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); - if (who != cred->uid) - free_uid(user); /* For find_user() */ - break; - } -out_unlock: - read_unlock(&tasklist_lock); - rcu_read_unlock(); -out: - return error; -} - -/* - * Ugh. To avoid negative return values, "getpriority()" will - * not return the normal nice-value, but a negated value that - * has been offset by 20 (ie it returns 40..1 instead of -20..19) - * to stay compatible. 
- */ -SYSCALL_DEFINE2(getpriority, int, which, int, who) -{ - struct task_struct *g, *p; - struct user_struct *user; - const struct cred *cred = current_cred(); - long niceval, retval = -ESRCH; - struct pid *pgrp; - - if (which > PRIO_USER || which < PRIO_PROCESS) - return -EINVAL; - - rcu_read_lock(); - read_lock(&tasklist_lock); - switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - user = (struct user_struct *) cred->user; - if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (__task_cred(p)->uid == who) { - niceval = 20 - task_nice(p); - if (niceval > retval) - retval = niceval; - } - } while_each_thread(g, p); - if (who != cred->uid) - free_uid(user); /* for find_user() */ - break; - } -out_unlock: - read_unlock(&tasklist_lock); - rcu_read_unlock(); - - return retval; -} - -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. 
- */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); - syscore_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. 
- */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - disable_nonboot_cpus(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. 
*/ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - -/* - * Unprivileged users may change the real gid to the effective gid - * or vice versa. 
(BSD-style) - * - * If you set the real gid at all, or set the effective gid to a value not - * equal to the real gid, then the saved gid is set to the new effective gid. - * - * This makes it possible for a setgid program to completely drop its - * privileges, which is often a useful assertion to make when you are doing - * a security audit over a program. - * - * The general idea is that a program which uses just setregid() will be - * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. - * - * SMP: There are not races, the GIDs are checked only by filesystem - * operations (as far as semantic preservation is concerned). - */ -SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (rgid != (gid_t) -1) { - if (old->gid == rgid || - old->egid == rgid || - nsown_capable(CAP_SETGID)) - new->gid = rgid; - else - goto error; - } - if (egid != (gid_t) -1) { - if (old->gid == egid || - old->egid == egid || - old->sgid == egid || - nsown_capable(CAP_SETGID)) - new->egid = egid; - else - goto error; - } - - if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old->gid)) - new->sgid = new->egid; - new->fsgid = new->egid; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -/* - * setgid() is implemented like SysV w/ SAVED_IDS - * - * SMP: Same implicit races as above. 
- */ -SYSCALL_DEFINE1(setgid, gid_t, gid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (nsown_capable(CAP_SETGID)) - new->gid = new->egid = new->sgid = new->fsgid = gid; - else if (gid == old->gid || gid == old->sgid) - new->egid = new->fsgid = gid; - else - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -/* - * change the user struct in a credentials set to match the new UID - */ -static int set_user(struct cred *new) -{ - struct user_struct *new_user; - - new_user = alloc_uid(current_user_ns(), new->uid); - if (!new_user) - return -EAGAIN; - - /* - * We don't fail in case of NPROC limit excess here because too many - * poorly written programs don't check set*uid() return code, assuming - * it never fails if called by root. We may still enforce NPROC limit - * for programs doing set*uid()+execve() by harmlessly deferring the - * failure to the execve() stage. - */ - if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && - new_user != INIT_USER) - current->flags |= PF_NPROC_EXCEEDED; - else - current->flags &= ~PF_NPROC_EXCEEDED; - - free_uid(new->user); - new->user = new_user; - return 0; -} - -/* - * Unprivileged users may change the real uid to the effective uid - * or vice versa. (BSD-style) - * - * If you set the real uid at all, or set the effective uid to a value not - * equal to the real uid, then the saved uid is set to the new effective uid. - * - * This makes it possible for a setuid program to completely drop its - * privileges, which is often a useful assertion to make when you are doing - * a security audit over a program. - * - * The general idea is that a program which uses just setreuid() will be - * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. 
- */ -SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (ruid != (uid_t) -1) { - new->uid = ruid; - if (old->uid != ruid && - old->euid != ruid && - !nsown_capable(CAP_SETUID)) - goto error; - } - - if (euid != (uid_t) -1) { - new->euid = euid; - if (old->uid != euid && - old->euid != euid && - old->suid != euid && - !nsown_capable(CAP_SETUID)) - goto error; - } - - if (new->uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old->uid)) - new->suid = new->euid; - new->fsuid = new->euid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_RE); - if (retval < 0) - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -/* - * setuid() is implemented like SysV with SAVED_IDS - * - * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal - * user and then switch back, because if you're root, setuid() sets - * the saved uid too. If you don't like this, blame the bright people - * in the POSIX committee and/or USG. Note that the BSD-style setreuid() - * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. 
- */ -SYSCALL_DEFINE1(setuid, uid_t, uid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (nsown_capable(CAP_SETUID)) { - new->suid = new->uid = uid; - if (uid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - } else if (uid != old->uid && uid != new->suid) { - goto error; - } - - new->fsuid = new->euid = uid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_ID); - if (retval < 0) - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - - -/* - * This function implements a generic ability to update ruid, euid, - * and suid. This allows you to implement the 4.4 compatible seteuid(). - */ -SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - old = current_cred(); - - retval = -EPERM; - if (!nsown_capable(CAP_SETUID)) { - if (ruid != (uid_t) -1 && ruid != old->uid && - ruid != old->euid && ruid != old->suid) - goto error; - if (euid != (uid_t) -1 && euid != old->uid && - euid != old->euid && euid != old->suid) - goto error; - if (suid != (uid_t) -1 && suid != old->uid && - suid != old->euid && suid != old->suid) - goto error; - } - - if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { - retval = set_user(new); - if (retval < 0) - goto error; - } - } - if (euid != (uid_t) -1) - new->euid = euid; - if (suid != (uid_t) -1) - new->suid = suid; - new->fsuid = new->euid; - - retval = security_task_fix_setuid(new, old, LSM_SETID_RES); - if (retval < 0) - goto error; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) -{ - const struct cred *cred = current_cred(); - int retval; - - if (!(retval = 
put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) - retval = put_user(cred->suid, suid); - - return retval; -} - -/* - * Same as above, but for rgid, egid, sgid. - */ -SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) -{ - const struct cred *old; - struct cred *new; - int retval; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!nsown_capable(CAP_SETGID)) { - if (rgid != (gid_t) -1 && rgid != old->gid && - rgid != old->egid && rgid != old->sgid) - goto error; - if (egid != (gid_t) -1 && egid != old->gid && - egid != old->egid && egid != old->sgid) - goto error; - if (sgid != (gid_t) -1 && sgid != old->gid && - sgid != old->egid && sgid != old->sgid) - goto error; - } - - if (rgid != (gid_t) -1) - new->gid = rgid; - if (egid != (gid_t) -1) - new->egid = egid; - if (sgid != (gid_t) -1) - new->sgid = sgid; - new->fsgid = new->egid; - - return commit_creds(new); - -error: - abort_creds(new); - return retval; -} - -SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) -{ - const struct cred *cred = current_cred(); - int retval; - - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) - retval = put_user(cred->sgid, sgid); - - return retval; -} - - -/* - * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This - * is used for "access()" and for the NFS daemon (letting nfsd stay at - * whatever uid it wants to). It normally shadows "euid", except when - * explicitly set by setfsuid() or for access.. 
- */ -SYSCALL_DEFINE1(setfsuid, uid_t, uid) -{ - const struct cred *old; - struct cred *new; - uid_t old_fsuid; - - new = prepare_creds(); - if (!new) - return current_fsuid(); - old = current_cred(); - old_fsuid = old->fsuid; - - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || - nsown_capable(CAP_SETUID)) { - if (uid != old_fsuid) { - new->fsuid = uid; - if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) - goto change_okay; - } - } - - abort_creds(new); - return old_fsuid; - -change_okay: - commit_creds(new); - return old_fsuid; -} - -/* - * Samma pÃ¥ svenska.. - */ -SYSCALL_DEFINE1(setfsgid, gid_t, gid) -{ - const struct cred *old; - struct cred *new; - gid_t old_fsgid; - - new = prepare_creds(); - if (!new) - return current_fsgid(); - old = current_cred(); - old_fsgid = old->fsgid; - - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || - nsown_capable(CAP_SETGID)) { - if (gid != old_fsgid) { - new->fsgid = gid; - goto change_okay; - } - } - - abort_creds(new); - return old_fsgid; - -change_okay: - commit_creds(new); - return old_fsgid; -} - -void do_sys_times(struct tms *tms) -{ - cputime_t tgutime, tgstime, cutime, cstime; - - spin_lock_irq(¤t->sighand->siglock); - thread_group_times(current, &tgutime, &tgstime); - cutime = current->signal->cutime; - cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); - tms->tms_utime = cputime_to_clock_t(tgutime); - tms->tms_stime = cputime_to_clock_t(tgstime); - tms->tms_cutime = cputime_to_clock_t(cutime); - tms->tms_cstime = cputime_to_clock_t(cstime); -} - -SYSCALL_DEFINE1(times, struct tms __user *, tbuf) -{ - if (tbuf) { - struct tms tmp; - - do_sys_times(&tmp); - if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) - return -EFAULT; - } - force_successful_syscall_return(); - return (long) jiffies_64_to_clock_t(get_jiffies_64()); -} - -/* - * This needs some heavy checking ... - * I just haven't the stomach for it. 
I also don't fully - * understand sessions/pgrp etc. Let somebody who does explain it. - * - * OK, I think I have the protection semantics right.... this is really - * only important on a multi-user system anyway, to make sure one user - * can't send a signal to a process owned by another. -TYT, 12/12/91 - * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 - */ -SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) -{ - struct task_struct *p; - struct task_struct *group_leader = current->group_leader; - struct pid *pgrp; - int err; - - if (!pid) - pid = task_pid_vnr(group_leader); - if (!pgid) - pgid = pid; - if (pgid < 0) - return -EINVAL; - rcu_read_lock(); - - /* From this point forward we keep holding onto the tasklist lock - * so that our parent does not change from under us. -DaveM - */ - write_lock_irq(&tasklist_lock); - - err = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - - err = -EINVAL; - if (!thread_group_leader(p)) - goto out; - - if (same_thread_group(p->real_parent, group_leader)) { - err = -EPERM; - if (task_session(p) != task_session(group_leader)) - goto out; - err = -EACCES; - if (p->did_exec) - goto out; - } else { - err = -ESRCH; - if (p != group_leader) - goto out; - } - - err = -EPERM; - if (p->signal->leader) - goto out; - - pgrp = task_pid(p); - if (pgid != pid) { - struct task_struct *g; - - pgrp = find_vpid(pgid); - g = pid_task(pgrp, PIDTYPE_PGID); - if (!g || task_session(g) != task_session(group_leader)) - goto out; - } - - err = security_task_setpgid(p, pgid); - if (err) - goto out; - - if (task_pgrp(p) != pgrp) - change_pid(p, PIDTYPE_PGID, pgrp); - - err = 0; -out: - /* All paths lead to here, thus we are safe. 
-DaveM */ - write_unlock_irq(&tasklist_lock); - rcu_read_unlock(); - return err; -} - -SYSCALL_DEFINE1(getpgid, pid_t, pid) -{ - struct task_struct *p; - struct pid *grp; - int retval; - - rcu_read_lock(); - if (!pid) - grp = task_pgrp(current); - else { - retval = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - grp = task_pgrp(p); - if (!grp) - goto out; - - retval = security_task_getpgid(p); - if (retval) - goto out; - } - retval = pid_vnr(grp); -out: - rcu_read_unlock(); - return retval; -} - -#ifdef __ARCH_WANT_SYS_GETPGRP - -SYSCALL_DEFINE0(getpgrp) -{ - return sys_getpgid(0); -} - -#endif - -SYSCALL_DEFINE1(getsid, pid_t, pid) -{ - struct task_struct *p; - struct pid *sid; - int retval; - - rcu_read_lock(); - if (!pid) - sid = task_session(current); - else { - retval = -ESRCH; - p = find_task_by_vpid(pid); - if (!p) - goto out; - sid = task_session(p); - if (!sid) - goto out; - - retval = security_task_getsid(p); - if (retval) - goto out; - } - retval = pid_vnr(sid); -out: - rcu_read_unlock(); - return retval; -} - -SYSCALL_DEFINE0(setsid) -{ - struct task_struct *group_leader = current->group_leader; - struct pid *sid = task_pid(group_leader); - pid_t session = pid_vnr(sid); - int err = -EPERM; - - write_lock_irq(&tasklist_lock); - /* Fail if I am already a session leader */ - if (group_leader->signal->leader) - goto out; - - /* Fail if a process group id already exists that equals the - * proposed session id. 
- */ - if (pid_task(sid, PIDTYPE_PGID)) - goto out; - - group_leader->signal->leader = 1; - __set_special_pids(sid); - - proc_clear_tty(group_leader); - - err = session; -out: - write_unlock_irq(&tasklist_lock); - if (err > 0) { - proc_sid_connector(group_leader); - sched_autogroup_create_attach(group_leader); - } - return err; -} - -DECLARE_RWSEM(uts_sem); - -#ifdef COMPAT_UTS_MACHINE -#define override_architecture(name) \ - (personality(current->personality) == PER_LINUX32 && \ - copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ - sizeof(COMPAT_UTS_MACHINE))) -#else -#define override_architecture(name) 0 -#endif - -/* - * Work around broken programs that cannot handle "Linux 3.0". - * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 - */ -static int override_release(char __user *release, int len) -{ - int ret = 0; - char buf[65]; - - if (current->personality & UNAME26) { - char *rest = UTS_RELEASE; - int ndots = 0; - unsigned v; - - while (*rest) { - if (*rest == '.' && ++ndots >= 3) - break; - if (!isdigit(*rest) && *rest != '.') - break; - rest++; - } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - snprintf(buf, len, "2.6.%u%s", v, rest); - ret = copy_to_user(release, buf, len); - } - return ret; -} - -SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) -{ - int errno = 0; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; - up_read(&uts_sem); - - if (!errno && override_release(name->release, sizeof(name->release))) - errno = -EFAULT; - if (!errno && override_architecture(name)) - errno = -EFAULT; - return errno; -} - -#ifdef __ARCH_WANT_SYS_OLD_UNAME -/* - * Old cruft - */ -SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) -{ - int error = 0; - - if (!name) - return -EFAULT; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - error = -EFAULT; - up_read(&uts_sem); - - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - if 
(!error && override_architecture(name)) - error = -EFAULT; - return error; -} - -SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) -{ - int error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); - up_read(&uts_sem); - - if (!error && override_architecture(name)) - error = -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - return error ? 
-EFAULT : 0; -} -#endif - -SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) -{ - int errno; - char tmp[__NEW_UTS_LEN]; - - if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); - - memcpy(u->nodename, tmp, len); - memset(u->nodename + len, 0, sizeof(u->nodename) - len); - errno = 0; - } - uts_proc_notify(UTS_PROC_HOSTNAME); - up_write(&uts_sem); - return errno; -} - -#ifdef __ARCH_WANT_SYS_GETHOSTNAME - -SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) -{ - int i, errno; - struct new_utsname *u; - - if (len < 0) - return -EINVAL; - down_read(&uts_sem); - u = utsname(); - i = 1 + strlen(u->nodename); - if (i > len) - i = len; - errno = 0; - if (copy_to_user(name, u->nodename, i)) - errno = -EFAULT; - up_read(&uts_sem); - return errno; -} - -#endif - -/* - * Only setdomainname; getdomainname can be implemented by calling - * uname() - */ -SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) -{ - int errno; - char tmp[__NEW_UTS_LEN]; - - if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - if (len < 0 || len > __NEW_UTS_LEN) - return -EINVAL; - - down_write(&uts_sem); - errno = -EFAULT; - if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); - - memcpy(u->domainname, tmp, len); - memset(u->domainname + len, 0, sizeof(u->domainname) - len); - errno = 0; - } - uts_proc_notify(UTS_PROC_DOMAINNAME); - up_write(&uts_sem); - return errno; -} - -SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) -{ - struct rlimit value; - int ret; - - ret = do_prlimit(current, resource, NULL, &value); - if (!ret) - ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? 
-EFAULT : 0; - - return ret; -} - -#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT - -/* - * Back compatibility for getrlimit. Needed for some apps. - */ - -SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, - struct rlimit __user *, rlim) -{ - struct rlimit x; - if (resource >= RLIM_NLIMITS) - return -EINVAL; - - task_lock(current->group_leader); - x = current->signal->rlim[resource]; - task_unlock(current->group_leader); - if (x.rlim_cur > 0x7FFFFFFF) - x.rlim_cur = 0x7FFFFFFF; - if (x.rlim_max > 0x7FFFFFFF) - x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; -} - -#endif - -static inline bool rlim64_is_infinity(__u64 rlim64) -{ -#if BITS_PER_LONG < 64 - return rlim64 >= ULONG_MAX; -#else - return rlim64 == RLIM64_INFINITY; -#endif -} - -static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) -{ - if (rlim->rlim_cur == RLIM_INFINITY) - rlim64->rlim_cur = RLIM64_INFINITY; - else - rlim64->rlim_cur = rlim->rlim_cur; - if (rlim->rlim_max == RLIM_INFINITY) - rlim64->rlim_max = RLIM64_INFINITY; - else - rlim64->rlim_max = rlim->rlim_max; -} - -static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) -{ - if (rlim64_is_infinity(rlim64->rlim_cur)) - rlim->rlim_cur = RLIM_INFINITY; - else - rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; - if (rlim64_is_infinity(rlim64->rlim_max)) - rlim->rlim_max = RLIM_INFINITY; - else - rlim->rlim_max = (unsigned long)rlim64->rlim_max; -} - -/* make sure you are allowed to change @tsk limits before calling this */ -int do_prlimit(struct task_struct *tsk, unsigned int resource, - struct rlimit *new_rlim, struct rlimit *old_rlim) -{ - struct rlimit *rlim; - int retval = 0; - - if (resource >= RLIM_NLIMITS) - return -EINVAL; - if (new_rlim) { - if (new_rlim->rlim_cur > new_rlim->rlim_max) - return -EINVAL; - if (resource == RLIMIT_NOFILE && - new_rlim->rlim_max > sysctl_nr_open) - return -EPERM; - } - - /* protect tsk->signal and tsk->sighand from disappearing */ - 
read_lock(&tasklist_lock); - if (!tsk->sighand) { - retval = -ESRCH; - goto out; - } - - rlim = tsk->signal->rlim + resource; - task_lock(tsk->group_leader); - if (new_rlim) { - /* Keep the capable check against init_user_ns until - cgroups can contain all limits */ - if (new_rlim->rlim_max > rlim->rlim_max && - !capable(CAP_SYS_RESOURCE)) - retval = -EPERM; - if (!retval) - retval = security_task_setrlimit(tsk->group_leader, - resource, new_rlim); - if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim->rlim_cur = 1; - } - } - if (!retval) { - if (old_rlim) - *old_rlim = *rlim; - if (new_rlim) - *rlim = *new_rlim; - } - task_unlock(tsk->group_leader); - - /* - * RLIMIT_CPU handling. Note that the kernel fails to return an error - * code if it rejected the user's attempt to set RLIMIT_CPU. This is a - * very long-standing error, and fixing it now risks breakage of - * applications, so we live with it - */ - if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY) - update_rlimit_cpu(tsk, new_rlim->rlim_cur); -out: - read_unlock(&tasklist_lock); - return retval; -} - -/* rcu lock must be held */ -static int check_prlimit_permission(struct task_struct *task) -{ - const struct cred *cred = current_cred(), *tcred; - - if (current == task) - return 0; - - tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) - return 0; - if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) - return 0; - - return -EPERM; -} - -SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, - const struct rlimit64 __user *, new_rlim, - struct rlimit64 __user 
*, old_rlim) -{ - struct rlimit64 old64, new64; - struct rlimit old, new; - struct task_struct *tsk; - int ret; - - if (new_rlim) { - if (copy_from_user(&new64, new_rlim, sizeof(new64))) - return -EFAULT; - rlim64_to_rlim(&new64, &new); - } - - rcu_read_lock(); - tsk = pid ? find_task_by_vpid(pid) : current; - if (!tsk) { - rcu_read_unlock(); - return -ESRCH; - } - ret = check_prlimit_permission(tsk); - if (ret) { - rcu_read_unlock(); - return ret; - } - get_task_struct(tsk); - rcu_read_unlock(); - - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? &old : NULL); - - if (!ret && old_rlim) { - rlim_to_rlim64(&old, &old64); - if (copy_to_user(old_rlim, &old64, sizeof(old64))) - ret = -EFAULT; - } - - put_task_struct(tsk); - return ret; -} - -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) -{ - struct rlimit new_rlim; - - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - return do_prlimit(current, resource, &new_rlim, NULL); -} - -/* - * It would make sense to put struct rusage in the task_struct, - * except that would make the task_struct be *really big*. After - * task_struct gets moved into malloc'ed memory, it would - * make sense to do this. It will make moving the rest of the information - * a lot simpler! (Which we're not doing right now because we're not - * measuring them yet). - * - * When sampling multiple threads for RUSAGE_SELF, under SMP we might have - * races with threads incrementing their own counters. But since word - * reads are atomic, we either get new values or old values and we don't - * care which for the sums. We always take the siglock to protect reading - * the c* fields from p->signal from races with exit.c updating those - * fields when reaping, so a sample either gets all the additions of a - * given child after it's reaped, or none so this sample is before reaping. 
- * - * Locking: - * We need to take the siglock for CHILDEREN, SELF and BOTH - * for the cases current multithreaded, non-current single threaded - * non-current multithreaded. Thread traversal is now safe with - * the siglock held. - * Strictly speaking, we donot need to take the siglock if we are current and - * single threaded, as no one else can take our signal_struct away, no one - * else can reap the children to update signal->c* counters, and no one else - * can race with the signal-> fields. If we do not take any lock, the - * signal-> fields could be read out of order while another thread was just - * exiting. So we should place a read memory barrier when we avoid the lock. - * On the writer side, write memory barrier is implied in __exit_signal - * as __exit_signal releases the siglock spinlock after updating the signal-> - * fields. But we don't do this yet to keep things simple. - * - */ - -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) -{ - r->ru_nvcsw += t->nvcsw; - r->ru_nivcsw += t->nivcsw; - r->ru_minflt += t->min_flt; - r->ru_majflt += t->maj_flt; - r->ru_inblock += task_io_get_inblock(t); - r->ru_oublock += task_io_get_oublock(t); -} - -static void k_getrusage(struct task_struct *p, int who, struct rusage *r) -{ - struct task_struct *t; - unsigned long flags; - cputime_t tgutime, tgstime, utime, stime; - unsigned long maxrss = 0; - - memset((char *) r, 0, sizeof *r); - utime = stime = 0; - - if (who == RUSAGE_THREAD) { - task_times(current, &utime, &stime); - accumulate_thread_rusage(p, r); - maxrss = p->signal->maxrss; - goto out; - } - - if (!lock_task_sighand(p, &flags)) - return; - - switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = 
p->signal->coublock; - maxrss = p->signal->cmaxrss; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - thread_group_times(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { - accumulate_thread_rusage(t, r); - t = next_thread(t); - } while (t != p); - break; - - default: - BUG(); - } - unlock_task_sighand(p, &flags); - -out: - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); - - if (who != RUSAGE_CHILDREN) { - struct mm_struct *mm = get_task_mm(p); - if (mm) { - setmax_mm_hiwater_rss(&maxrss, mm); - mmput(mm); - } - } - r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ -} - -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -{ - struct rusage r; - k_getrusage(p, who, &r); - return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; -} - -SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) -{ - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && - who != RUSAGE_THREAD) - return -EINVAL; - return getrusage(current, who, ru); -} - -SYSCALL_DEFINE1(umask, int, mask) -{ - mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); - return mask; -} - -#ifdef CONFIG_CHECKPOINT_RESTORE -static int prctl_set_mm(int opt, unsigned long addr, - unsigned long arg4, unsigned long arg5) -{ - unsigned long rlim = rlimit(RLIMIT_DATA); - unsigned long vm_req_flags; - unsigned long vm_bad_flags; - struct vm_area_struct *vma; - int error = 0; - struct mm_struct *mm = current->mm; - - if (arg4 | arg5) - return -EINVAL; - - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - if (addr >= TASK_SIZE) - return -EINVAL; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, addr); - - if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { - /* It must be existing VMA */ - if (!vma || vma->vm_start > addr) - goto out; - } - - error = -EINVAL; - switch (opt) { - case PR_SET_MM_START_CODE: - case PR_SET_MM_END_CODE: - vm_req_flags = VM_READ | VM_EXEC; - vm_bad_flags = VM_WRITE | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_CODE) - mm->start_code = addr; - else - mm->end_code = addr; - break; - - case PR_SET_MM_START_DATA: - case PR_SET_MM_END_DATA: - vm_req_flags = VM_READ | VM_WRITE; - vm_bad_flags = VM_EXEC | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_DATA) - mm->start_data = addr; - else - mm->end_data = addr; - break; - - case PR_SET_MM_START_STACK: - -#ifdef CONFIG_STACK_GROWSUP - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; -#else - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; -#endif - if ((vma->vm_flags & vm_req_flags) != vm_req_flags) - goto out; - - mm->start_stack = addr; - break; - - 
case PR_SET_MM_START_BRK: - if (addr <= mm->end_data) - goto out; - - if (rlim < RLIM_INFINITY && - (mm->brk - addr) + - (mm->end_data - mm->start_data) > rlim) - goto out; - - mm->start_brk = addr; - break; - - case PR_SET_MM_BRK: - if (addr <= mm->end_data) - goto out; - - if (rlim < RLIM_INFINITY && - (addr - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) - goto out; - - mm->brk = addr; - break; - - default: - error = -EINVAL; - goto out; - } - - error = 0; - -out: - up_read(&mm->mmap_sem); - - return error; -} -#else /* CONFIG_CHECKPOINT_RESTORE */ -static int prctl_set_mm(int opt, unsigned long addr, - unsigned long arg4, unsigned long arg5) -{ - return -EINVAL; -} -#endif - -SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, - unsigned long, arg4, unsigned long, arg5) -{ - struct task_struct *me = current; - unsigned char comm[sizeof(me->comm)]; - long error; - - error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error != -ENOSYS) - return error; - - error = 0; - switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - me->pdeath_signal = arg2; - error = 0; - break; - case PR_GET_PDEATHSIG: - error = put_user(me->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); - break; - case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { - error = -EINVAL; - break; - } - set_dumpable(me->mm, arg2); - error = 0; - break; - - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(me, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(me, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(me, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(me, arg2); - break; - case PR_SET_FPEXC: - error = SET_FPEXC_CTL(me, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(me, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; - break; - case PR_SET_TIMING: - if (arg2 != 
PR_TIMING_STATISTICAL) - error = -EINVAL; - else - error = 0; - break; - - case PR_SET_NAME: - comm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(comm, (char __user *)arg2, - sizeof(me->comm) - 1) < 0) - return -EFAULT; - set_task_comm(me, comm); - proc_comm_connector(me); - return 0; - case PR_GET_NAME: - get_task_comm(comm, me); - if (copy_to_user((char __user *)arg2, comm, - sizeof(comm))) - return -EFAULT; - return 0; - case PR_GET_ENDIAN: - error = GET_ENDIAN(me, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(me, arg2); - break; - - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); - break; - case PR_GET_TSC: - error = GET_TSC_CTL(arg2); - break; - case PR_SET_TSC: - error = SET_TSC_CTL(arg2); - break; - case PR_TASK_PERF_EVENTS_DISABLE: - error = perf_event_task_disable(); - break; - case PR_TASK_PERF_EVENTS_ENABLE: - error = perf_event_task_enable(); - break; - case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; - break; - case PR_SET_TIMERSLACK: - if (arg2 <= 0) - current->timer_slack_ns = - current->default_timer_slack_ns; - else - current->timer_slack_ns = arg2; - error = 0; - break; - case PR_MCE_KILL: - if (arg4 | arg5) - return -EINVAL; - switch (arg2) { - case PR_MCE_KILL_CLEAR: - if (arg3 != 0) - return -EINVAL; - current->flags &= ~PF_MCE_PROCESS; - break; - case PR_MCE_KILL_SET: - current->flags |= PF_MCE_PROCESS; - if (arg3 == PR_MCE_KILL_EARLY) - current->flags |= PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_LATE) - current->flags &= ~PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_DEFAULT) - current->flags &= - ~(PF_MCE_EARLY|PF_MCE_PROCESS); - else - return -EINVAL; - break; - default: - return -EINVAL; - } - error = 0; - break; - case PR_MCE_KILL_GET: - if (arg2 | arg3 | arg4 | arg5) - return -EINVAL; - if (current->flags & PF_MCE_PROCESS) - error = (current->flags & PF_MCE_EARLY) ? 
- PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; - else - error = PR_MCE_KILL_DEFAULT; - break; - case PR_SET_MM: - error = prctl_set_mm(arg2, arg3, arg4, arg5); - break; - default: - error = -EINVAL; - break; - } - return error; -} - -SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) -{ - int err = 0; - int cpu = raw_smp_processor_id(); - if (cpup) - err |= put_user(cpu, cpup); - if (nodep) - err |= put_user(cpu_to_node(cpu), nodep); - return err ? -EFAULT : 0; -} - -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static void argv_cleanup(struct subprocess_info *info) -{ - argv_free(info->argv); -} - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) -{ - int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret = -ENOMEM; - struct subprocess_info *info; - - if (argv == NULL) { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - goto out; - } - - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); - - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); - - out: - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? 
*/ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - -#include -#include - -#include - -/* we can't #include here, - but tell gcc to not warn with -Wmissing-prototypes */ -asmlinkage long sys_ni_syscall(void); - -/* - * Non-implemented system calls get redirected here. - */ -asmlinkage long sys_ni_syscall(void) -{ - return -ENOSYS; -} - -cond_syscall(sys_quotactl); -cond_syscall(sys32_quotactl); -cond_syscall(sys_acct); -cond_syscall(sys_lookup_dcookie); -cond_syscall(sys_swapon); -cond_syscall(sys_swapoff); -cond_syscall(sys_kexec_load); -cond_syscall(compat_sys_kexec_load); -cond_syscall(sys_init_module); -cond_syscall(sys_delete_module); -cond_syscall(sys_socketpair); -cond_syscall(sys_bind); -cond_syscall(sys_listen); -cond_syscall(sys_accept); -cond_syscall(sys_accept4); -cond_syscall(sys_connect); -cond_syscall(sys_getsockname); -cond_syscall(sys_getpeername); -cond_syscall(sys_sendto); -cond_syscall(sys_send); -cond_syscall(sys_recvfrom); -cond_syscall(sys_recv); -cond_syscall(sys_socket); -cond_syscall(sys_setsockopt); -cond_syscall(compat_sys_setsockopt); -cond_syscall(sys_getsockopt); -cond_syscall(compat_sys_getsockopt); -cond_syscall(sys_shutdown); -cond_syscall(sys_sendmsg); -cond_syscall(sys_sendmmsg); -cond_syscall(compat_sys_sendmsg); -cond_syscall(compat_sys_sendmmsg); -cond_syscall(sys_recvmsg); -cond_syscall(sys_recvmmsg); -cond_syscall(compat_sys_recvmsg); -cond_syscall(compat_sys_recv); -cond_syscall(compat_sys_recvfrom); -cond_syscall(compat_sys_recvmmsg); -cond_syscall(sys_socketcall); -cond_syscall(sys_futex); -cond_syscall(compat_sys_futex); -cond_syscall(sys_set_robust_list); -cond_syscall(compat_sys_set_robust_list); -cond_syscall(sys_get_robust_list); -cond_syscall(compat_sys_get_robust_list); -cond_syscall(sys_epoll_create); -cond_syscall(sys_epoll_create1); -cond_syscall(sys_epoll_ctl); -cond_syscall(sys_epoll_wait); -cond_syscall(sys_epoll_pwait); 
-cond_syscall(compat_sys_epoll_pwait); -cond_syscall(sys_semget); -cond_syscall(sys_semop); -cond_syscall(sys_semtimedop); -cond_syscall(compat_sys_semtimedop); -cond_syscall(sys_semctl); -cond_syscall(compat_sys_semctl); -cond_syscall(sys_msgget); -cond_syscall(sys_msgsnd); -cond_syscall(compat_sys_msgsnd); -cond_syscall(sys_msgrcv); -cond_syscall(compat_sys_msgrcv); -cond_syscall(sys_msgctl); -cond_syscall(compat_sys_msgctl); -cond_syscall(sys_shmget); -cond_syscall(sys_shmat); -cond_syscall(compat_sys_shmat); -cond_syscall(sys_shmdt); -cond_syscall(sys_shmctl); -cond_syscall(compat_sys_shmctl); -cond_syscall(sys_mq_open); -cond_syscall(sys_mq_unlink); -cond_syscall(sys_mq_timedsend); -cond_syscall(sys_mq_timedreceive); -cond_syscall(sys_mq_notify); -cond_syscall(sys_mq_getsetattr); -cond_syscall(compat_sys_mq_open); -cond_syscall(compat_sys_mq_timedsend); -cond_syscall(compat_sys_mq_timedreceive); -cond_syscall(compat_sys_mq_notify); -cond_syscall(compat_sys_mq_getsetattr); -cond_syscall(sys_mbind); -cond_syscall(sys_get_mempolicy); -cond_syscall(sys_set_mempolicy); -cond_syscall(compat_sys_mbind); -cond_syscall(compat_sys_get_mempolicy); -cond_syscall(compat_sys_set_mempolicy); -cond_syscall(sys_add_key); -cond_syscall(sys_request_key); -cond_syscall(sys_keyctl); -cond_syscall(compat_sys_keyctl); -cond_syscall(compat_sys_socketcall); -cond_syscall(sys_inotify_init); -cond_syscall(sys_inotify_init1); -cond_syscall(sys_inotify_add_watch); -cond_syscall(sys_inotify_rm_watch); -cond_syscall(sys_migrate_pages); -cond_syscall(sys_move_pages); -cond_syscall(sys_chown16); -cond_syscall(sys_fchown16); -cond_syscall(sys_getegid16); -cond_syscall(sys_geteuid16); -cond_syscall(sys_getgid16); -cond_syscall(sys_getgroups16); -cond_syscall(sys_getresgid16); -cond_syscall(sys_getresuid16); -cond_syscall(sys_getuid16); -cond_syscall(sys_lchown16); -cond_syscall(sys_setfsgid16); -cond_syscall(sys_setfsuid16); -cond_syscall(sys_setgid16); -cond_syscall(sys_setgroups16); 
-cond_syscall(sys_setregid16); -cond_syscall(sys_setresgid16); -cond_syscall(sys_setresuid16); -cond_syscall(sys_setreuid16); -cond_syscall(sys_setuid16); -cond_syscall(sys_vm86old); -cond_syscall(sys_vm86); -cond_syscall(sys_ipc); -cond_syscall(compat_sys_ipc); -cond_syscall(compat_sys_sysctl); -cond_syscall(sys_flock); -cond_syscall(sys_io_setup); -cond_syscall(sys_io_destroy); -cond_syscall(sys_io_submit); -cond_syscall(sys_io_cancel); -cond_syscall(sys_io_getevents); -cond_syscall(sys_syslog); -cond_syscall(sys_process_vm_readv); -cond_syscall(sys_process_vm_writev); -cond_syscall(compat_sys_process_vm_readv); -cond_syscall(compat_sys_process_vm_writev); - -/* arch-specific weak syscall entries */ -cond_syscall(sys_pciconfig_read); -cond_syscall(sys_pciconfig_write); -cond_syscall(sys_pciconfig_iobase); -cond_syscall(sys32_ipc); -cond_syscall(ppc_rtas); -cond_syscall(sys_spu_run); -cond_syscall(sys_spu_create); -cond_syscall(sys_subpage_prot); - -/* mmu depending weak syscall entries */ -cond_syscall(sys_mprotect); -cond_syscall(sys_msync); -cond_syscall(sys_mlock); -cond_syscall(sys_munlock); -cond_syscall(sys_mlockall); -cond_syscall(sys_munlockall); -cond_syscall(sys_mincore); -cond_syscall(sys_madvise); -cond_syscall(sys_mremap); -cond_syscall(sys_remap_file_pages); -cond_syscall(compat_sys_move_pages); -cond_syscall(compat_sys_migrate_pages); - -/* block-layer dependent */ -cond_syscall(sys_bdflush); -cond_syscall(sys_ioprio_set); -cond_syscall(sys_ioprio_get); - -/* New file descriptors */ -cond_syscall(sys_signalfd); -cond_syscall(sys_signalfd4); -cond_syscall(compat_sys_signalfd); -cond_syscall(compat_sys_signalfd4); -cond_syscall(sys_timerfd_create); -cond_syscall(sys_timerfd_settime); -cond_syscall(sys_timerfd_gettime); -cond_syscall(compat_sys_timerfd_settime); -cond_syscall(compat_sys_timerfd_gettime); -cond_syscall(sys_eventfd); -cond_syscall(sys_eventfd2); - -/* performance counters: */ -cond_syscall(sys_perf_event_open); - -/* fanotify! 
*/ -cond_syscall(sys_fanotify_init); -cond_syscall(sys_fanotify_mark); - -/* open by handle */ -cond_syscall(sys_name_to_handle_at); -cond_syscall(sys_open_by_handle_at); -cond_syscall(compat_sys_open_by_handle_at); -/* - * sysctl.c: General linux system control interface - * - * Begun 24 March 1995, Stephen Tweedie - * Added /proc support, Dec 1995 - * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. - * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. - * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. - * Dynamic registration fixes, Stephen Tweedie. - * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. - * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris - * Horn. - * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. - * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. - * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill - * Wendling. - * The list_for_each() macro wasn't appropriate for the sysctl loop. - * Removed it and replaced it with older style, 03/23/00, Bill Wendling - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef CONFIG_X86 -#include -#include -#include -#endif -#ifdef CONFIG_BSD_PROCESS_ACCT -#include -#endif -#ifdef CONFIG_RT_MUTEXES -#include -#endif -#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) -#include -#endif -#ifdef CONFIG_CHR_DEV_SG -#include -#endif - -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif - - -#if defined(CONFIG_SYSCTL) - -/* External variables not in a header file. 
*/ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern int max_threads; -extern int core_uses_pid; -extern int suid_dumpable; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; -extern int pid_max; -extern int min_free_kbytes; -extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; -extern int percpu_pagelist_fraction; -extern int compat_log; -extern int latencytop_enabled; -extern int sysctl_nr_open_min, sysctl_nr_open_max; -#ifndef CONFIG_MMU -extern int sysctl_nr_trim_pages; -#endif -#ifdef CONFIG_BLOCK -extern int blk_iopoll_enabled; -#endif - -/* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -static int neg_one = -1; -#endif - -static int zero; -static int __maybe_unused one = 1; -static int __maybe_unused two = 2; -static int __maybe_unused three = 3; -static unsigned long one_ul = 1; -static int one_hundred = 100; -#ifdef CONFIG_PRINTK -static int ten_thousand = 10000; -#endif - -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; - -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; -static int min_percpu_pagelist_fract = 8; - -static int ngroups_max = NGROUPS_MAX; -static const int cap_last_cap = CAP_LAST_CAP; - -#ifdef CONFIG_INOTIFY_USER -#include -#endif -#ifdef CONFIG_SPARC -#include -#endif - -#ifdef CONFIG_SPARC64 -extern int sysctl_tsb_ratio; -#endif - -#ifdef __hppa__ -extern int pwrsw_enabled; -extern int unaligned_enabled; -#endif - -#ifdef CONFIG_IA64 -extern int no_unaligned_warning; -extern int unaligned_dump_stack; -#endif - -#ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); 
-#endif - -#ifdef CONFIG_PRINTK -static int proc_dmesg_restrict(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif - -#ifdef CONFIG_MAGIC_SYSRQ -/* Note: sysrq code uses it's own private copy */ -static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; - -static int sysrq_sysctl_handler(ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int error; - - error = proc_dointvec(table, write, buffer, lenp, ppos); - if (error) - return error; - - if (write) - sysrq_toggle_support(__sysrq_enabled); - - return 0; -} - -#endif - -static struct ctl_table root_table[]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - -static struct ctl_table kern_table[]; -static struct ctl_table vm_table[]; -static struct ctl_table fs_table[]; -static struct ctl_table debug_table[]; -static struct ctl_table dev_table[]; -extern struct ctl_table random_table[]; -#ifdef CONFIG_EPOLL -extern struct ctl_table epoll_table[]; -#endif - -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT -int sysctl_legacy_va_layout; -#endif - -/* The default sysctl tables: */ - -static struct ctl_table root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; - -#ifdef CONFIG_SCHED_DEBUG -static int min_sched_granularity_ns = 
100000; /* 100 usecs */ -static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ -static int min_wakeup_granularity_ns; /* 0 usecs */ -static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; -static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif - -#ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; -#endif - -static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHED_DEBUG - { - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_latency_ns", - .data = &sysctl_sched_latency, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .procname = "sched_tunable_scaling", - .data = &sysctl_sched_tunable_scaling, - .maxlen = sizeof(enum sched_tunable_scaling), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_tunable_scaling, - .extra2 = &max_sched_tunable_scaling, - }, - { - .procname = "sched_migration_cost", - .data = &sysctl_sched_migration_cost, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_nr_migrate", - .data = 
&sysctl_sched_nr_migrate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_time_avg", - .data = &sysctl_sched_time_avg, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_shares_window", - .data = &sysctl_sched_shares_window, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, -#endif - { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, -#endif -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_uses_pid", - .data = 
&core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, -#endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef __hppa__ - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "ctrl-alt-del", - .data = 
&C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, - }, -#endif -#ifdef CONFIG_HOTPLUG - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = &__sysrq_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, -#endif - { - .procname = "threads-max", - .data = &max_threads, - .maxlen = 
sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { - .procname = "usermodehelper", - .mode = 0555, - .child = usermodehelper_table, - }, - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = 
&ten_thousand, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dmesg_restrict, - .extra1 = &zero, - .extra2 = &two, - }, -#endif - { - .procname = "ngroups_max", - .data = &ngroups_max, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "cap_last_cap", - .data = (void *)&cap_last_cap, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &neg_one, - .extra2 = &sixty, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "nmi_watchdog", - .data = &watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dowatchdog, - .extra1 = &zero, - .extra2 = &one, - }, -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = 
proc_dointvec, - }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW - { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_MMU) - { - .procname = "randomize_va_space", - .data = &randomize_va_space, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif -#ifdef CONFIG_IA64 - { - .procname = "ignore-unaligned-usertrap", - .data = &no_unaligned_warning, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "unaligned-dump-stack", - .data = &unaligned_dump_stack, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DETECT_HUNG_TASK - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, 
- }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif -#ifdef CONFIG_COMPAT - { - .procname = "compat-log", - .data = &compat_log, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_RT_MUTEXES - { - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, -#ifdef CONFIG_KEYS - { - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif -#ifdef CONFIG_RCU_TORTURE_TEST - { - .procname = "rcutorture_runnable", - .data = &rcutorture_runnable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! 
- */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_proc_update_handler, - }, -#endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_BLOCK - { - .procname = "blk_iopoll", - .data = &blk_iopoll_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { } -}; - -static struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &two, - }, - { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &two, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - 
.maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = &zero, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = &zero, - .extra2 = &one_hundred, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = proc_dointvec, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one_hundred, - }, -#ifdef CONFIG_HUGETLB_PAGE - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, - }, -#ifdef CONFIG_NUMA - 
{ - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = hugetlb_treat_movable_handler, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, - }, -#endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = &one, - .extra2 = &three, - }, -#ifdef CONFIG_COMPACTION - { - .procname = "compact_memory", - .data = &sysctl_compact_memory, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = sysctl_compaction_handler, - }, - { - .procname = "extfrag_threshold", - .data = &sysctl_extfrag_threshold, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_extfrag_handler, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, - }, - -#endif /* CONFIG_COMPACTION */ - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = &zero, - }, - { - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = 
sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, -#endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "block_dump", - .data = &block_dump, - .maxlen = sizeof(block_dump), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = &zero, - }, - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = &zero, - }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = &zero, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = &zero, - }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = &zero, - .extra2 = &one_hundred, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = &zero, - .extra2 = &one_hundred, - }, 
-#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = &zero, - }, -#endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, -#endif - { - .procname = "scan_unevictable_pages", - .data = &scan_unevictable_pages, - .maxlen = sizeof(scan_unevictable_pages), - .mode = 0644, - .proc_handler = scan_unevictable_handler, - }, -#ifdef CONFIG_MEMORY_FAILURE - { - .procname = "memory_failure_early_kill", - .data = &sysctl_memory_failure_early_kill, - .maxlen = sizeof(sysctl_memory_failure_early_kill), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "memory_failure_recovery", - .data = &sysctl_memory_failure_recovery, - .maxlen = sizeof(sysctl_memory_failure_recovery), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, -#endif - { } -}; - -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) -static struct ctl_table binfmt_misc_table[] = { - { } -}; -#endif 
- -static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(int), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(int), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(int), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = 
sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &two, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = binfmt_misc_table, - }, -#endif - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &pipe_proc_fn, - .extra1 = &pipe_min_size, - }, - { } -}; - -static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) - { - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = &zero, - .extra2 = &one, - }, -#endif - { } -}; - -static struct ctl_table dev_table[] = { - { } -}; - -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - 
complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. - */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct 
ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. 
- */ - -static int test_perm(int mode, int op) -{ - if (!current_euid()) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int mode; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - -static __init int sysctl_init(void) -{ - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - sysctl_check_table(current->nsproxy, root_table); -#endif - return 0; -} - -core_initcall(sysctl_init); - -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... 
and nothing else */ - if (branch[1].procname) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. 
- * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). 
- * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. 
- */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? 
parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ - return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} - -#endif /* CONFIG_SYSCTL */ - -/* - * /proc/sys support - */ - -#ifdef CONFIG_PROC_SYSCTL - -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - size_t len; - char __user *p; - char c; - - if (!data || !maxlen || !*lenp) { - *lenp = 0; - return 0; - } - - if (write) { - len = 0; - p = buffer; - while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; - if (c == 0 || c == '\n') - break; - len++; - } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; - ((char *) data)[len] = 0; - *ppos += *lenp; - } else { - len = strlen(data); - if (len > maxlen) - len = maxlen; - - if (*ppos > len) { - *lenp = 0; - return 0; - } - - data += *ppos; - len -= *ppos; - - if (len > *lenp) - len = *lenp; - if (len) - if(copy_to_user(buffer, data, len)) - return -EFAULT; - if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) - return -EFAULT; - len++; - } - *lenp = len; - *ppos += len; - } - return 0; -} - -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. 
If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); -} - -static size_t proc_skip_spaces(char **buf) -{ - size_t ret; - char *tmp = skip_spaces(*buf); - ret = tmp - *buf; - *buf = tmp; - return ret; -} - -static void proc_skip_char(char **buf, size_t *size, const char v) -{ - while (*size) { - if (**buf != v) - break; - (*size)--; - (*buf)++; - } -} - -#define TMPBUFLEN 22 -/** - * proc_get_long - reads an ASCII formatted integer from a user buffer - * - * @buf: a kernel buffer - * @size: size of the kernel buffer - * @val: this is where the number will be stored - * @neg: set to %TRUE if number is negative - * @perm_tr: a vector which contains the allowed trailers - * @perm_tr_len: size of the perm_tr vector - * @tr: pointer to store the trailer character - * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes read. If @tr is non-NULL and a trailing - * character exists (size is non-zero after returning from this - * function), @tr is updated with the trailing character. 
- */ -static int proc_get_long(char **buf, size_t *size, - unsigned long *val, bool *neg, - const char *perm_tr, unsigned perm_tr_len, char *tr) -{ - int len; - char *p, tmp[TMPBUFLEN]; - - if (!*size) - return -EINVAL; - - len = *size; - if (len > TMPBUFLEN - 1) - len = TMPBUFLEN - 1; - - memcpy(tmp, *buf, len); - - tmp[len] = 0; - p = tmp; - if (*p == '-' && *size > 1) { - *neg = true; - p++; - } else - *neg = false; - if (!isdigit(*p)) - return -EINVAL; - - *val = simple_strtoul(p, &p, 0); - - len = p - tmp; - - /* We don't know if the next char is whitespace thus we may accept - * invalid integers (e.g. 1234...a) or two integers instead of one - * (e.g. 123...1). So lets not allow such large numbers. */ - if (len == TMPBUFLEN - 1) - return -EINVAL; - - if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) - return -EINVAL; - - if (tr && (len < *size)) - *tr = *p; - - *buf += len; - *size -= len; - - return 0; -} - -/** - * proc_put_long - converts an integer to a decimal ASCII formatted string - * - * @buf: the user buffer - * @size: the size of the user buffer - * @val: the integer to be converted - * @neg: sign of the number, %TRUE for negative - * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. - */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) -{ - int len; - char tmp[TMPBUFLEN], *p = tmp; - - sprintf(p, "%s%lu", neg ? 
"-" : "", val); - len = strlen(tmp); - if (len > *size) - len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; - *size -= len; - *buf += len; - return 0; -} -#undef TMPBUFLEN - -static int proc_put_char(void __user **buf, size_t *size, char c) -{ - if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; - *buf = *buffer; - } - return 0; -} - -static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - *valp = *negp ? -*lvalp : *lvalp; - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = (unsigned long)-val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; - -static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - int *i, vleft, first = 1, err = 0; - unsigned long page = 0; - size_t left; - char *kbuf; - - if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (int *) tbl_data; - vleft = table->maxlen / sizeof(*i); - left = *lenp; - - if (!conv) - conv = do_proc_dointvec_conv; - - if (write) { - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; - } - - for (; left && vleft--; i++, first=0) { - unsigned long lval; - bool neg; - - if (write) { - left -= proc_skip_spaces(&kbuf); - - if (!left) - break; - err = proc_get_long(&kbuf, &left, &lval, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err) - break; - if (conv(&neg, &lval, i, 1, data)) { - err = -EINVAL; - break; - } 
- } else { - if (conv(&neg, &lval, i, 0, data)) { - err = -EINVAL; - break; - } - if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; - } - } - - if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); - if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: - if (write) { - free_page(page); - if (first) - return err ? : -EINVAL; - } - *lenp -= left; - *ppos += *lenp; - return err; -} - -static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(bool *negp, unsigned long *lvalp, int *valp, - int write, void *data), - void *data) -{ - return __do_proc_dointvec(table->data, table, write, - buffer, lenp, ppos, conv, data); -} - -/** - * proc_dointvec - read a vector of integers - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * Returns 0 on success. - */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); -} - -/* - * Taint values can only be increased - * This means we can safely use a temporary. - */ -static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long tmptaint = get_taint(); - int err; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - if (write) { - /* - * Poor man's atomic or. 
Not worth adding a primitive - * to everyone's atomic.h for this - */ - int i; - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) - add_taint(i); - } - } - - return err; -} - -#ifdef CONFIG_PRINTK -static int proc_dmesg_restrict(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - -struct do_proc_dointvec_minmax_conv_param { - int *min; - int *max; -}; - -static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - struct do_proc_dointvec_minmax_conv_param *param = data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) - return -EINVAL; - *valp = val; - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = (unsigned long)-val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } - } - return 0; -} - -/** - * proc_dointvec_minmax - read a vector of integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. 
- */ -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct do_proc_dointvec_minmax_conv_param param = { - .min = (int *) table->extra1, - .max = (int *) table->extra2, - }; - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_minmax_conv, ¶m); -} - -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - unsigned long *i, *min, *max; - int vleft, first = 1, err = 0; - unsigned long page = 0; - size_t left; - char *kbuf; - - if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; - vleft = table->maxlen / sizeof(unsigned long); - left = *lenp; - - if (write) { - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; - } - - for (; left && vleft--; i++, first = 0) { - unsigned long val; - - if (write) { - bool neg; - - left -= proc_skip_spaces(&kbuf); - - err = proc_get_long(&kbuf, &left, &val, &neg, - proc_wspace_sep, - sizeof(proc_wspace_sep), NULL); - if (err) - break; - if (neg) - continue; - if ((min && val < *min) || (max && val > *max)) - continue; - *i = val; - } else { - val = convdiv * (*i) / convmul; - if (!first) - err = proc_put_char(&buffer, &left, '\t'); - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; - } - } - - if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); - if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: - if (write) { - free_page(page); - if (first) - return err ? 
: -EINVAL; - } - *lenp -= left; - *ppos += *lenp; - return err; -} - -static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) -{ - return __do_proc_doulongvec_minmax(table->data, table, write, - buffer, lenp, ppos, convmul, convdiv); -} - -/** - * proc_doulongvec_minmax - read a vector of long integers with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. - */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); -} - -/** - * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long - * values from/to the user buffer, treated as an ASCII string. The values - * are treated as milliseconds, and converted to jiffies when they are stored. - * - * This routine will ensure the values are within the range specified by - * table->extra1 (min) and table->extra2 (max). - * - * Returns 0 on success. 
- */ -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return do_proc_doulongvec_minmax(table, write, buffer, - lenp, ppos, HZ, 1000l); -} - - -static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (*lvalp > LONG_MAX / HZ) - return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = (unsigned long)-val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = lval / HZ; - } - return 0; -} - -static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) - return 1; - *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = (unsigned long)-val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_clock_t(lval); - } - return 0; -} - -static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - if (write) { - *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); - } else { - int val = *valp; - unsigned long lval; - if (val < 0) { - *negp = true; - lval = (unsigned long)-val; - } else { - *negp = false; - lval = (unsigned long)val; - } - *lvalp = jiffies_to_msecs(lval); - } - return 0; -} - -/** - * proc_dointvec_jiffies - read a vector of integers as seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. 
- * The values read are assumed to be in seconds, and are converted into - * jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_jiffies_conv,NULL); -} - -/** - * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: pointer to the file position - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and - * are converted into jiffies. - * - * Returns 0 on success. - */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); -} - -/** - * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * @ppos: the current position in the file - * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and - * are converted into jiffies. - * - * Returns 0 on success. 
- */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_dointvec_ms_jiffies_conv, NULL); -} - -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; -} - -/** - * proc_do_large_bitmap - read/write from/to a large bitmap - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * The bitmap is stored at table->data and the bitmap length (in bits) - * in table->maxlen. - * - * We use a range comma separated format (e.g. 1,3-4,10-10) so that - * large bitmaps may be represented in a compact manner. Writing into - * the file will clear the bitmap then update it with the given input. - * - * Returns 0 on success. 
- */ -int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int err = 0; - bool first = 1; - size_t left = *lenp; - unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; - unsigned long *tmp_bitmap = NULL; - char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - - if (!bitmap_len || !left || (*ppos && !write)) { - *lenp = 0; - return 0; - } - - if (write) { - unsigned long page = 0; - char *kbuf; - - if (left > PAGE_SIZE - 1) - left = PAGE_SIZE - 1; - - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; - - tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), - GFP_KERNEL); - if (!tmp_bitmap) { - free_page(page); - return -ENOMEM; - } - proc_skip_char(&kbuf, &left, '\n'); - while (!err && left) { - unsigned long val_a, val_b; - bool neg; - - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, - sizeof(tr_a), &c); - if (err) - break; - if (val_a >= bitmap_len || neg) { - err = -EINVAL; - break; - } - - val_b = val_a; - if (left) { - kbuf++; - left--; - } - - if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, - &neg, tr_b, sizeof(tr_b), - &c); - if (err) - break; - if (val_b >= bitmap_len || neg || - val_a > val_b) { - err = -EINVAL; - break; - } - if (left) { - kbuf++; - left--; - } - } - - while (val_a <= val_b) - set_bit(val_a++, tmp_bitmap); - - first = 0; - proc_skip_char(&kbuf, &left, '\n'); - } - free_page(page); - } else { - unsigned long bit_a, bit_b = 0; - - while (left) { - bit_a = find_next_bit(bitmap, bitmap_len, bit_b); - if (bit_a >= bitmap_len) - break; - bit_b = find_next_zero_bit(bitmap, bitmap_len, - bit_a + 1) - 1; - - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if 
(err) - break; - if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; - } - - first = 0; bit_b++; - } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); - } - - if (!err) { - if (write) { - if (*ppos) - bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); - else - memcpy(bitmap, tmp_bitmap, - BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); - } - kfree(tmp_bitmap); - *lenp -= left; - *ppos += *lenp; - return 0; - } else { - kfree(tmp_bitmap); - return err; - } -} - -#else /* CONFIG_PROC_SYSCTL */ - -int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - - -#endif /* CONFIG_PROC_SYSCTL */ - -/* - * No sense putting this after each symbol definition, twice, - * exception granted :-) - */ -EXPORT_SYMBOL(proc_dointvec); -EXPORT_SYMBOL(proc_dointvec_jiffies); -EXPORT_SYMBOL(proc_dointvec_minmax); 
-EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -EXPORT_SYMBOL(proc_dointvec_ms_jiffies); -EXPORT_SYMBOL(proc_dostring); -EXPORT_SYMBOL(proc_doulongvec_minmax); -EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_SYSCTL_SYSCALL - -struct bin_table; -typedef ssize_t bin_convert_t(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); - -static bin_convert_t bin_dir; -static bin_convert_t bin_string; -static bin_convert_t bin_intvec; -static bin_convert_t bin_ulongvec; -static bin_convert_t bin_uuid; -static bin_convert_t bin_dn_node_address; - -#define CTL_DIR bin_dir -#define CTL_STR bin_string -#define CTL_INT bin_intvec -#define CTL_ULONG bin_ulongvec -#define CTL_UUID bin_uuid -#define CTL_DNADR bin_dn_node_address - -#define BUFSZ 256 - -struct bin_table { - bin_convert_t *convert; - int ctl_name; - const char *procname; - const struct bin_table *child; -}; - -static const struct bin_table bin_random_table[] = { - { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, - { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, - { CTL_UUID, RANDOM_UUID, "uuid" }, - {} -}; - -static const struct bin_table bin_pty_table[] = { - { CTL_INT, PTY_MAX, "max" }, - { CTL_INT, PTY_NR, "nr" }, - {} -}; - -static const struct bin_table bin_kern_table[] = { - { CTL_STR, KERN_OSTYPE, "ostype" }, - { CTL_STR, KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { CTL_STR, KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { 
CTL_STR, KERN_NODENAME, "hostname" }, - { CTL_STR, KERN_DOMAINNAME, "domainname" }, - - { CTL_INT, KERN_PANIC, "panic" }, - { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, - - { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, - { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, - { CTL_INT, KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { CTL_STR, KERN_MODPROBE, "modprobe" }, - { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, - { CTL_INT, KERN_ACCT, "acct" }, - /* KERN_PPC_L2CR "l2cr" no longer used */ - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { CTL_ULONG, KERN_SHMMAX, "shmmax" }, - { CTL_INT, KERN_MSGMAX, "msgmax" }, - { CTL_INT, KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { CTL_INT, KERN_SYSRQ, "sysrq" }, - { CTL_INT, KERN_MAX_THREADS, "threads-max" }, - { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, - { CTL_ULONG, KERN_SHMALL, "shmall" }, - { CTL_INT, KERN_MSGMNI, "msgmni" }, - { CTL_INT, KERN_SEM, "sem" }, - { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, - { CTL_INT, KERN_SHMMNI, "shmmni" }, - - { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, - { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, - - { CTL_STR, KERN_HOTPLUG, "hotplug", }, - { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, - /* KERN_TAINTED "tainted" no longer used */ - { CTL_INT, KERN_CADPID, "cad_pid" }, - { CTL_INT, KERN_PIDMAX, "pid_max" }, - { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, - { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, - { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { CTL_DIR, KERN_PTY, 
"pty", bin_pty_table }, - { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, - { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - /* KERN_HZ_TIMER "hz_timer" no longer used */ - { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, - - { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, - /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ - { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, - { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - {} -}; - -static const struct bin_table bin_vm_table[] = { - { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, - { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, - /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ - /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, - { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ - { CTL_INT, VM_SWAPPINESS, "swappiness" }, - { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, - { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, - { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, - { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, - { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { CTL_INT, 
VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, - { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, - { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct bin_table bin_net_core_table[] = { - { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, - { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, - { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, - { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, - { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, - { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, - { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, - { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { CTL_INT, NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct bin_table bin_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct bin_table bin_net_ipv4_route_table[] = { - { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, - /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ - /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" 
}, - /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_vars_table[] = { - { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, - { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - - { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - {} -}; - -static 
const struct bin_table bin_net_ipv4_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, - {} -}; - -static const struct bin_table bin_net_neigh_vars_table[] = { - { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, - /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ - { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ - /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ - /* NET_NEIGH_LOCKTIME "locktime" no longer used */ - { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct bin_table bin_net_neigh_table[] = { - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, - { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, - {} -}; - -static const struct bin_table bin_net_ipv4_netfilter_table[] = { - { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no 
longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ - - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ - /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD 
"ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct bin_table bin_net_ipv4_table[] = { - {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, - - { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, - { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, - - { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, - { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, 
"tcp_abort_on_overflow" }, - { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, - { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, - { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { CTL_INT, NET_TCP_FACK, "tcp_fack" }, - { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, - { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, - { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, - { CTL_INT, NET_TCP_MEM, "tcp_mem" }, - { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, - { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, - { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, - { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, - { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_ABC, "tcp_abc" }, - { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, - { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, - { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { 
CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ - { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - - { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - - { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - /* NET_TCP_BIC_BETA unused */ - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - /* NET_IPV4_IP_MASQ_DEBUG unused */ - /* NET_TCP_SYN_TAILDROP unused */ - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - /* NET_IPV4_ALWAYS_DEFRAG unused */ - {} -}; - -static const struct bin_table bin_net_ipx_table[] = { - { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct bin_table 
bin_net_atalk_table[] = { - { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct bin_table bin_net_netrom_table[] = { - { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { CTL_INT, NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct bin_table bin_net_ax25_param_table[] = { - { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, - { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, - { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, - { 
CTL_INT, NET_AX25_PROTOCOL, "protocol" }, - { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct bin_table bin_net_ax25_table[] = { - { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, - {} -}; - -static const struct bin_table bin_net_rose_table[] = { - { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, - { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_var_table[] = { - { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, - { CTL_INT, NET_IPV6_MTU, "mtu" }, - { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, - { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, - { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, 
"force_mld_version" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, - { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, - {} -}; - -static const struct bin_table bin_net_ipv6_route_table[] = { - /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ - { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct bin_table bin_net_ipv6_icmp_table[] = { - { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct bin_table bin_net_ipv6_table[] = { - { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, - { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table }, - { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, - { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, - { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { CTL_INT, 
NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct bin_table bin_net_x25_table[] = { - { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { CTL_INT, NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct bin_table bin_net_tr_table[] = { - { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct bin_table bin_net_decnet_conf_vars[] = { - { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, - { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct bin_table bin_net_decnet_conf[] = { - { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, - { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, - {} -}; - -static const struct bin_table bin_net_decnet_table[] = { - { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, - { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, - { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, - { CTL_STR, 
NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, - { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, - { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, - { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, - { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, - { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, - { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, - { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct bin_table bin_net_sctp_table[] = { - { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, - { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, - { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, - { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, - { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, - { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, - { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct bin_table bin_net_llc_llc2_timeout_table[] = { - { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, - { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, - { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" }, - { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct bin_table bin_net_llc_station_table[] = { - { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static 
const struct bin_table bin_net_llc_llc2_table[] = { - { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, - {} -}; - -static const struct bin_table bin_net_llc_table[] = { - { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, - { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, - {} -}; - -static const struct bin_table bin_net_netfilter_table[] = { - { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ - /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ - /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ - /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ - /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, 
"nf_conntrack_tcp_max_retrans" }, - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ - /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct bin_table bin_net_irda_table[] = { - { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, - { CTL_STR, NET_IRDA_DEVNAME, "devname" }, - { CTL_INT, NET_IRDA_DEBUG, "debug" }, - { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { CTL_INT, 
NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - -static const struct bin_table bin_net_table[] = { - { CTL_DIR, NET_CORE, "core", bin_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, - { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, - { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, - { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, - { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, - { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, - /* NET_BRIDGE "bridge" no longer used */ - { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, - { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, - { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, - { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, - { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, - /* NET_ECONET not used */ - { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, - { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, - { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, - /* NET_DCCP "dccp" no longer used */ - { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, - { CTL_INT, 2089, "nf_conntrack_max" }, - {} -}; - -static const struct bin_table bin_fs_quota_table[] = { - { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, - { CTL_INT, FS_DQ_DROPS, "drops" }, - { CTL_INT, FS_DQ_READS, "reads" }, - { CTL_INT, FS_DQ_WRITES, "writes" }, - { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, - { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, - { CTL_INT, FS_DQ_FREE, "free_dquots" }, - { CTL_INT, FS_DQ_SYNCS, "syncs" }, - { CTL_INT, FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct bin_table bin_fs_xfs_table[] = { - { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, - { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, - - { CTL_INT, XFS_ERRLEVEL, "error_level" }, - { CTL_INT, XFS_SYNCD_TIMER, 
"xfssyncd_centisecs" }, - { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, - { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, - { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, - { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, - { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, - { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct bin_table bin_fs_ocfs2_nm_table[] = { - { CTL_STR, 1, "hb_ctl_path" }, - {} -}; - -static const struct bin_table bin_fs_ocfs2_table[] = { - { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, - {} -}; - -static const struct bin_table bin_inotify_table[] = { - { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct bin_table bin_fs_table[] = { - { CTL_INT, FS_NRINODE, "inode-nr" }, - { CTL_INT, FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - /* FS_NRFILE "file-nr" no longer used */ - { CTL_INT, FS_MAXFILE, "file-max" }, - { CTL_INT, FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, - { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, - { CTL_INT, FS_LEASES, "leases-enable" }, - { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, - { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, - { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, - { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, - { CTL_ULONG, FS_AIO_NR, "aio-nr" }, - { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, - { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, - { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, - { CTL_INT, KERN_SETUID_DUMPABLE, 
"suid_dumpable" }, - {} -}; - -static const struct bin_table bin_ipmi_table[] = { - { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct bin_table bin_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct bin_table bin_raid_table[] = { - { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct bin_table bin_scsi_table[] = { - { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct bin_table bin_dev_table[] = { - /* DEV_CDROM "cdrom" no longer used */ - /* DEV_HWMON unused */ - /* DEV_PARPORT "parport" no longer used */ - { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, - { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, - { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, - { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, - {} -}; - -static const struct bin_table bin_bus_isa_table[] = { - { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, - { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, - { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct bin_table bin_bus_table[] = { - { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, - {} -}; - - -static const struct bin_table bin_s390dbf_table[] = { - { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct bin_table bin_sunrpc_table[] = { - /* CTL_RPCDEBUG "rpc_debug" no longer used */ - /* CTL_NFSDEBUG "nfs_debug" no longer used */ - /* CTL_NFSDDEBUG "nfsd_debug" no 
longer used */ - /* CTL_NLMDEBUG "nlm_debug" no longer used */ - - { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct bin_table bin_pm_table[] = { - /* frv specific */ - /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ - { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, - { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, - { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct bin_table bin_root_table[] = { - { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, - { CTL_DIR, CTL_VM, "vm", bin_vm_table }, - { CTL_DIR, CTL_NET, "net", bin_net_table }, - /* CTL_PROC not used */ - { CTL_DIR, CTL_FS, "fs", bin_fs_table }, - /* CTL_DEBUG "debug" no longer used */ - { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, - { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, - { CTL_DIR, CTL_ABI, "abi" }, - /* CTL_CPU not used */ - /* CTL_ARLAN "arlan" no longer used */ - { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, - { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, - { CTL_DIR, CTL_PM, "pm", bin_pm_table }, - {} -}; - -static ssize_t bin_dir(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOTDIR; -} - - -static ssize_t bin_string(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - if (oldval && oldlen) { - char __user *lastp; - loff_t pos = 0; - int ch; - - result = vfs_read(file, oldval, oldlen, &pos); - if (result < 0) - goto out; - - copied = result; - lastp = oldval + copied - 1; - - result = -EFAULT; - if (get_user(ch, lastp)) - goto out; - - /* Trim off the trailing newline */ - if (ch == '\n') { - result = -EFAULT; - if (put_user('\0', lastp)) - goto out; - copied -= 1; - } - } - - if (newval && newlen) { - loff_t pos = 0; - - result = 
vfs_write(file, newval, newlen, &pos); - if (result < 0) - goto out; - } - - result = copied; -out: - return result; -} - -static ssize_t bin_intvec(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - mm_segment_t old_fs = get_fs(); - ssize_t copied = 0; - char *buffer; - ssize_t result; - - result = -ENOMEM; - buffer = kmalloc(BUFSZ, GFP_KERNEL); - if (!buffer) - goto out; - - if (oldval && oldlen) { - unsigned __user *vec = oldval; - size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; - char *str, *end; - int i; - - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); - if (result < 0) - goto out_kfree; - - str = buffer; - end = str + result; - *end++ = '\0'; - for (i = 0; i < length; i++) { - unsigned long value; - - value = simple_strtoul(str, &str, 10); - while (isspace(*str)) - str++; - - result = -EFAULT; - if (put_user(value, vec + i)) - goto out_kfree; - - copied += sizeof(*vec); - if (!isdigit(*str)) - break; - } - } - - if (newval && newlen) { - unsigned __user *vec = newval; - size_t length = newlen / sizeof(*vec); - loff_t pos = 0; - char *str, *end; - int i; - - str = buffer; - end = str + BUFSZ; - for (i = 0; i < length; i++) { - unsigned long value; - - result = -EFAULT; - if (get_user(value, vec + i)) - goto out_kfree; - - str += snprintf(str, end - str, "%lu\t", value); - } - - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); - if (result < 0) - goto out_kfree; - } - result = copied; -out_kfree: - kfree(buffer); -out: - return result; -} - -static ssize_t bin_ulongvec(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - mm_segment_t old_fs = get_fs(); - ssize_t copied = 0; - char *buffer; - ssize_t result; - - result = -ENOMEM; - buffer = kmalloc(BUFSZ, GFP_KERNEL); - if (!buffer) - goto out; - - if (oldval && oldlen) { - unsigned long __user *vec = oldval; - size_t length = 
oldlen / sizeof(*vec); - loff_t pos = 0; - char *str, *end; - int i; - - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); - if (result < 0) - goto out_kfree; - - str = buffer; - end = str + result; - *end++ = '\0'; - for (i = 0; i < length; i++) { - unsigned long value; - - value = simple_strtoul(str, &str, 10); - while (isspace(*str)) - str++; - - result = -EFAULT; - if (put_user(value, vec + i)) - goto out_kfree; - - copied += sizeof(*vec); - if (!isdigit(*str)) - break; - } - } - - if (newval && newlen) { - unsigned long __user *vec = newval; - size_t length = newlen / sizeof(*vec); - loff_t pos = 0; - char *str, *end; - int i; - - str = buffer; - end = str + BUFSZ; - for (i = 0; i < length; i++) { - unsigned long value; - - result = -EFAULT; - if (get_user(value, vec + i)) - goto out_kfree; - - str += snprintf(str, end - str, "%lu\t", value); - } - - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); - if (result < 0) - goto out_kfree; - } - result = copied; -out_kfree: - kfree(buffer); -out: - return result; -} - -static ssize_t bin_uuid(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - mm_segment_t old_fs = get_fs(); - ssize_t result, copied = 0; - - /* Only supports reads */ - if (oldval && oldlen) { - loff_t pos = 0; - char buf[40], *str = buf; - unsigned char uuid[16]; - int i; - - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); - if (result < 0) - goto out; - - buf[result] = '\0'; - - /* Convert the uuid to from a string to binary */ - for (i = 0; i < 16; i++) { - result = -EIO; - if (!isxdigit(str[0]) || !isxdigit(str[1])) - goto out; - - uuid[i] = (hex_to_bin(str[0]) << 4) | - hex_to_bin(str[1]); - str += 2; - if (*str == '-') - str++; - } - - if (oldlen > 16) - oldlen = 16; - - result = -EFAULT; - if (copy_to_user(oldval, uuid, oldlen)) - goto out; - - copied = oldlen; - } - result 
= copied; -out: - return result; -} - -static ssize_t bin_dn_node_address(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - mm_segment_t old_fs = get_fs(); - ssize_t result, copied = 0; - - if (oldval && oldlen) { - loff_t pos = 0; - char buf[15], *nodep; - unsigned long area, node; - __le16 dnaddr; - - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); - if (result < 0) - goto out; - - buf[result] = '\0'; - - /* Convert the decnet address to binary */ - result = -EIO; - nodep = strchr(buf, '.') + 1; - if (!nodep) - goto out; - - area = simple_strtoul(buf, NULL, 10); - node = simple_strtoul(nodep, NULL, 10); - - result = -EIO; - if ((area > 63)||(node > 1023)) - goto out; - - dnaddr = cpu_to_le16((area << 10) | node); - - result = -EFAULT; - if (put_user(dnaddr, (__le16 __user *)oldval)) - goto out; - - copied = sizeof(dnaddr); - } - - if (newval && newlen) { - loff_t pos = 0; - __le16 dnaddr; - char buf[15]; - int len; - - result = -EINVAL; - if (newlen != sizeof(dnaddr)) - goto out; - - result = -EFAULT; - if (get_user(dnaddr, (__le16 __user *)newval)) - goto out; - - len = snprintf(buf, sizeof(buf), "%hu.%hu", - le16_to_cpu(dnaddr) >> 10, - le16_to_cpu(dnaddr) & 0x3ff); - - set_fs(KERNEL_DS); - result = vfs_write(file, buf, len, &pos); - set_fs(old_fs); - if (result < 0) - goto out; - } - - result = copied; -out: - return result; -} - -static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) -{ - const struct bin_table *table = &bin_root_table[0]; - int ctl_name; - - /* The binary sysctl tables have a small maximum depth so - * there is no danger of overflowing our path as it PATH_MAX - * bytes long. 
- */ - memcpy(path, "sys/", 4); - path += 4; - -repeat: - if (!nlen) - return ERR_PTR(-ENOTDIR); - ctl_name = *name; - name++; - nlen--; - for ( ; table->convert; table++) { - int len = 0; - - /* - * For a wild card entry map from ifindex to network - * device name. - */ - if (!table->ctl_name) { -#ifdef CONFIG_NET - struct net *net = current->nsproxy->net_ns; - struct net_device *dev; - dev = dev_get_by_index(net, ctl_name); - if (dev) { - len = strlen(dev->name); - memcpy(path, dev->name, len); - dev_put(dev); - } -#endif - /* Use the well known sysctl number to proc name mapping */ - } else if (ctl_name == table->ctl_name) { - len = strlen(table->procname); - memcpy(path, table->procname, len); - } - if (len) { - path += len; - if (table->child) { - *path++ = '/'; - table = table->child; - goto repeat; - } - *path = '\0'; - return table; - } - } - return ERR_PTR(-ENOTDIR); -} - -static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) -{ - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - const struct bin_table *table = get_sysctl(name, nlen, tmp); - result = tmp; - *tablep = table; - if (IS_ERR(table)) { - __putname(tmp); - result = ERR_CAST(table); - } - } - return result; -} - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - const struct bin_table *table = NULL; - struct vfsmount *mnt; - struct file *file; - ssize_t result; - char *pathname; - int flags; - - pathname = sysctl_getname(name, nlen, &table); - result = PTR_ERR(pathname); - if (IS_ERR(pathname)) - goto out; - - /* How should the sysctl be accessed? 
*/ - if (oldval && oldlen && newval && newlen) { - flags = O_RDWR; - } else if (newval && newlen) { - flags = O_WRONLY; - } else if (oldval && oldlen) { - flags = O_RDONLY; - } else { - result = 0; - goto out_putname; - } - - mnt = current->nsproxy->pid_ns->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); - result = PTR_ERR(file); - if (IS_ERR(file)) - goto out_putname; - - result = table->convert(file, oldval, oldlen, newval, newlen); - - fput(file); -out_putname: - __putname(pathname); -out: - return result; -} - - -#else /* CONFIG_SYSCTL_SYSCALL */ - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -#endif /* CONFIG_SYSCTL_SYSCALL */ - - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* - * CTL_KERN/KERN_VERSION is used by older glibc and cannot - * ever go away. - */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk("%d.", name[i]); - printk("\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<nlen. 
*/ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; -} - - -#ifdef CONFIG_COMPAT -#include - -struct compat_sysctl_args { - compat_uptr_t name; - int nlen; - compat_uptr_t oldval; - compat_uptr_t oldlenp; - compat_uptr_t newval; - compat_size_t newlen; - compat_ulong_t __unused[4]; -}; - -asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) -{ - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; -} - -#endif /* CONFIG_COMPAT */ -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include - - -static int sysctl_depth(struct ctl_table *table) -{ 
- struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char **fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct 
ctl_table *table) -{ - int error = 0; - for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} -/* - * taskstats.c - Export per-task statistics to userland - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2006 - * (C) Balbir Singh, IBM Corp. 
2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Maximum length of a cpumask that can be specified in - * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute - */ -#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) - -static DEFINE_PER_CPU(__u32, taskstats_seqnum); -static int family_registered; -struct kmem_cache *taskstats_cache; - -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = TASKSTATS_GENL_NAME, - .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, -}; - -static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { - [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, - [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, - [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, - [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; - -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { - [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, -}; - -struct listener { - struct list_head list; - pid_t pid; - char valid; -}; - -struct listener_list { - struct rw_semaphore sem; - struct list_head list; -}; -static DEFINE_PER_CPU(struct listener_list, listener_array); - -enum actions { - REGISTER, - DEREGISTER, - CPU_DONT_CARE -}; - -static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, - size_t size) -{ - 
struct sk_buff *skb; - void *reply; - - /* - * If new attributes are added, please revisit this allocation - */ - skb = genlmsg_new(size, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - if (!info) { - int seq = this_cpu_inc_return(taskstats_seqnum) - 1; - - reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); - } else - reply = genlmsg_put_reply(skb, info, &family, 0, cmd); - if (reply == NULL) { - nlmsg_free(skb); - return -EINVAL; - } - - *skbp = skb; - return 0; -} - -/* - * Send taskstats data in @skb to listener with nl_pid @pid - */ -static int send_reply(struct sk_buff *skb, struct genl_info *info) -{ - struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); - void *reply = genlmsg_data(genlhdr); - int rc; - - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return rc; - } - - return genlmsg_reply(skb, info); -} - -/* - * Send taskstats data in @skb to listeners registered for @cpu's exit data - */ -static void send_cpu_listeners(struct sk_buff *skb, - struct listener_list *listeners) -{ - struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); - struct listener *s, *tmp; - struct sk_buff *skb_next, *skb_cur = skb; - void *reply = genlmsg_data(genlhdr); - int rc, delcount = 0; - - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return; - } - - rc = 0; - down_read(&listeners->sem); - list_for_each_entry(s, &listeners->list, list) { - skb_next = NULL; - if (!list_is_last(&s->list, &listeners->list)) { - skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) - break; - } - rc = genlmsg_unicast(&init_net, skb_cur, s->pid); - if (rc == -ECONNREFUSED) { - s->valid = 0; - delcount++; - } - skb_cur = skb_next; - } - up_read(&listeners->sem); - - if (skb_cur) - nlmsg_free(skb_cur); - - if (!delcount) - return; - - /* Delete invalidated entries */ - down_write(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { - if (!s->valid) { - list_del(&s->list); - kfree(s); - } - } - up_write(&listeners->sem); 
-} - -static void fill_stats(struct task_struct *tsk, struct taskstats *stats) -{ - memset(stats, 0, sizeof(*stats)); - /* - * Each accounting subsystem adds calls to its functions to - * fill in relevant parts of struct taskstsats as follows - * - * per-task-foo(stats, tsk); - */ - - delayacct_add_tsk(stats, tsk); - - /* fill in basic acct fields */ - stats->version = TASKSTATS_VERSION; - stats->nvcsw = tsk->nvcsw; - stats->nivcsw = tsk->nivcsw; - bacct_add_tsk(stats, tsk); - - /* fill in extended acct fields */ - xacct_add_tsk(stats, tsk); -} - -static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) -{ - struct task_struct *tsk; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return -ESRCH; - fill_stats(tsk, stats); - put_task_struct(tsk); - return 0; -} - -static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) -{ - struct task_struct *tsk, *first; - unsigned long flags; - int rc = -ESRCH; - - /* - * Add additional stats from live tasks except zombie thread group - * leaders who are already counted with the dead tasks - */ - rcu_read_lock(); - first = find_task_by_vpid(tgid); - - if (!first || !lock_task_sighand(first, &flags)) - goto out; - - if (first->signal->stats) - memcpy(stats, first->signal->stats, sizeof(*stats)); - else - memset(stats, 0, sizeof(*stats)); - - tsk = first; - do { - if (tsk->exit_state) - continue; - /* - * Accounting subsystem can call its functions here to - * fill in relevant parts of struct taskstsats as follows - * - * per-task-foo(stats, tsk); - */ - delayacct_add_tsk(stats, tsk); - - stats->nvcsw += tsk->nvcsw; - stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); - - unlock_task_sighand(first, &flags); - rc = 0; -out: - rcu_read_unlock(); - - stats->version = TASKSTATS_VERSION; - /* - * Accounting subsystems can also add calls here to modify - * fields of taskstats. 
- */ - return rc; -} - -static void fill_tgid_exit(struct task_struct *tsk) -{ - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - if (!tsk->signal->stats) - goto ret; - - /* - * Each accounting subsystem calls its functions here to - * accumalate its per-task stats for tsk, into the per-tgid structure - * - * per-task-foo(tsk->signal->stats, tsk); - */ - delayacct_add_tsk(tsk->signal->stats, tsk); -ret: - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); - return; -} - -static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) -{ - struct listener_list *listeners; - struct listener *s, *tmp, *s2; - unsigned int cpu; - - if (!cpumask_subset(mask, cpu_possible_mask)) - return -EINVAL; - - if (isadd == REGISTER) { - for_each_cpu(cpu, mask) { - s = kmalloc_node(sizeof(struct listener), - GFP_KERNEL, cpu_to_node(cpu)); - if (!s) - goto cleanup; - - s->pid = pid; - s->valid = 1; - - listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); - list_for_each_entry(s2, &listeners->list, list) { - if (s2->pid == pid && s2->valid) - goto exists; - } - list_add(&s->list, &listeners->list); - s = NULL; -exists: - up_write(&listeners->sem); - kfree(s); /* nop if NULL */ - } - return 0; - } - - /* Deregister or cleanup */ -cleanup: - for_each_cpu(cpu, mask) { - listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { - if (s->pid == pid) { - list_del(&s->list); - kfree(s); - break; - } - } - up_write(&listeners->sem); - } - return 0; -} - -static int parse(struct nlattr *na, struct cpumask *mask) -{ - char *data; - int len; - int ret; - - if (na == NULL) - return 1; - len = nla_len(na); - if (len > TASKSTATS_CPUMASK_MAXLEN) - return -E2BIG; - if (len < 1) - return -EINVAL; - data = kmalloc(len, GFP_KERNEL); - if (!data) - return -ENOMEM; - nla_strlcpy(data, na, len); - ret = cpulist_parse(data, mask); - kfree(data); - return ret; -} - 
-#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define TASKSTATS_NEEDS_PADDING 1 -#endif - -static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) -{ - struct nlattr *na, *ret; - int aggr; - - aggr = (type == TASKSTATS_TYPE_PID) - ? TASKSTATS_TYPE_AGGR_PID - : TASKSTATS_TYPE_AGGR_TGID; - - /* - * The taskstats structure is internally aligned on 8 byte - * boundaries but the layout of the aggregrate reply, with - * two NLA headers and the pid (each 4 bytes), actually - * force the entire structure to be unaligned. This causes - * the kernel to issue unaligned access warnings on some - * architectures like ia64. Unfortunately, some software out there - * doesn't properly unroll the NLA packet and assumes that the start - * of the taskstats structure will always be 20 bytes from the start - * of the netlink payload. Aligning the start of the taskstats - * structure breaks this software, which we don't want. So, for now - * the alignment only happens on architectures that require it - * and those users will have to update to fixed versions of those - * packages. Space is reserved in the packet only when needed. - * This ifdef should be removed in several years e.g. 2012 once - * we can be confident that fixed versions are installed on most - * systems. We add the padding before the aggregate since the - * aggregate is already a defined type. 
- */ -#ifdef TASKSTATS_NEEDS_PADDING - if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) - goto err; -#endif - na = nla_nest_start(skb, aggr); - if (!na) - goto err; - - if (nla_put(skb, type, sizeof(pid), &pid) < 0) - goto err; - ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); - if (!ret) - goto err; - nla_nest_end(skb, na); - - return nla_data(ret); -err: - return NULL; -} - -static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) -{ - int rc = 0; - struct sk_buff *rep_skb; - struct cgroupstats *stats; - struct nlattr *na; - size_t size; - u32 fd; - struct file *file; - int fput_needed; - - na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; - if (!na) - return -EINVAL; - - fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); - file = fget_light(fd, &fput_needed); - if (!file) - return 0; - - size = nla_total_size(sizeof(struct cgroupstats)); - - rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, - size); - if (rc < 0) - goto err; - - na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, - sizeof(struct cgroupstats)); - stats = nla_data(na); - memset(stats, 0, sizeof(*stats)); - - rc = cgroupstats_build(stats, file->f_dentry); - if (rc < 0) { - nlmsg_free(rep_skb); - goto err; - } - - rc = send_reply(rep_skb, info); - -err: - fput_light(file, fput_needed); - return rc; -} - -static int cmd_attr_register_cpumask(struct genl_info *info) -{ - cpumask_var_t mask; - int rc; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); - if (rc < 0) - goto out; - rc = add_del_listener(info->snd_pid, mask, REGISTER); -out: - free_cpumask_var(mask); - return rc; -} - -static int cmd_attr_deregister_cpumask(struct genl_info *info) -{ - cpumask_var_t mask; - int rc; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); - if (rc < 0) - goto out; - rc = 
add_del_listener(info->snd_pid, mask, DEREGISTER); -out: - free_cpumask_var(mask); - return rc; -} - -static size_t taskstats_packet_size(void) -{ - size_t size; - - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); -#ifdef TASKSTATS_NEEDS_PADDING - size += nla_total_size(0); /* Padding for alignment */ -#endif - return size; -} - -static int cmd_attr_pid(struct genl_info *info) -{ - struct taskstats *stats; - struct sk_buff *rep_skb; - size_t size; - u32 pid; - int rc; - - size = taskstats_packet_size(); - - rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); - if (rc < 0) - return rc; - - rc = -EINVAL; - pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); - if (!stats) - goto err; - - rc = fill_stats_for_pid(pid, stats); - if (rc < 0) - goto err; - return send_reply(rep_skb, info); -err: - nlmsg_free(rep_skb); - return rc; -} - -static int cmd_attr_tgid(struct genl_info *info) -{ - struct taskstats *stats; - struct sk_buff *rep_skb; - size_t size; - u32 tgid; - int rc; - - size = taskstats_packet_size(); - - rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); - if (rc < 0) - return rc; - - rc = -EINVAL; - tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); - if (!stats) - goto err; - - rc = fill_stats_for_tgid(tgid, stats); - if (rc < 0) - goto err; - return send_reply(rep_skb, info); -err: - nlmsg_free(rep_skb); - return rc; -} - -static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) -{ - if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) - return cmd_attr_register_cpumask(info); - else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) - return cmd_attr_deregister_cpumask(info); - else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) - return cmd_attr_pid(info); - else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) - return cmd_attr_tgid(info); - else - return 
-EINVAL; -} - -static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct taskstats *stats; - - if (sig->stats || thread_group_empty(tsk)) - goto ret; - - /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); - - spin_lock_irq(&tsk->sighand->siglock); - if (!sig->stats) { - sig->stats = stats; - stats = NULL; - } - spin_unlock_irq(&tsk->sighand->siglock); - - if (stats) - kmem_cache_free(taskstats_cache, stats); -ret: - return sig->stats; -} - -/* Send pid data out on exit */ -void taskstats_exit(struct task_struct *tsk, int group_dead) -{ - int rc; - struct listener_list *listeners; - struct taskstats *stats; - struct sk_buff *rep_skb; - size_t size; - int is_thread_group; - - if (!family_registered) - return; - - /* - * Size includes space for nested attributes - */ - size = taskstats_packet_size(); - - is_thread_group = !!taskstats_tgid_alloc(tsk); - if (is_thread_group) { - /* PID + STATS + TGID + STATS */ - size = 2 * size; - /* fill the tsk->signal->stats structure */ - fill_tgid_exit(tsk); - } - - listeners = __this_cpu_ptr(&listener_array); - if (list_empty(&listeners->list)) - return; - - rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); - if (rc < 0) - return; - - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); - if (!stats) - goto err; - - fill_stats(tsk, stats); - - /* - * Doesn't matter if tsk is the leader or the last group member leaving - */ - if (!is_thread_group || !group_dead) - goto send; - - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); - if (!stats) - goto err; - - memcpy(stats, tsk->signal->stats, sizeof(*stats)); - -send: - send_cpu_listeners(rep_skb, listeners); - return; -err: - nlmsg_free(rep_skb); -} - -static struct genl_ops taskstats_ops = { - .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, -}; - -static struct 
genl_ops cgroupstats_ops = { - .cmd = CGROUPSTATS_CMD_GET, - .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, -}; - -/* Needed early in initialization */ -void __init taskstats_init_early(void) -{ - unsigned int i; - - taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); - for_each_possible_cpu(i) { - INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); - init_rwsem(&(per_cpu(listener_array, i).sem)); - } -} - -static int __init taskstats_init(void) -{ - int rc; - - rc = genl_register_family(&family); - if (rc) - return rc; - - rc = genl_register_ops(&family, &taskstats_ops); - if (rc < 0) - goto err; - - rc = genl_register_ops(&family, &cgroupstats_ops); - if (rc < 0) - goto err_cgroup_ops; - - family_registered = 1; - pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); - return 0; -err_cgroup_ops: - genl_unregister_ops(&family, &taskstats_ops); -err: - genl_unregister_family(&family); - return rc; -} - -/* - * late initcall ensures initialization of statistics collection - * mechanisms precedes initialization of the taskstats interface - */ -late_initcall(taskstats_init); -/* - * test_kprobes.c - simple sanity test for *probes - * - * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. 
- */ - -#include -#include -#include - -#define div_factor 3 - -static u32 rand1, preh_val, posth_val, jph_val; -static int errors, handler_errors, num_tests; -static u32 (*target)(u32 value); -static u32 (*target2)(u32 value); - -static noinline u32 kprobe_target(u32 value) -{ - return (value / div_factor); -} - -static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - preh_val = (rand1 / div_factor); - return 0; -} - -static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - if (preh_val != (rand1 / div_factor)) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); - } - posth_val = preh_val + div_factor; -} - -static struct kprobe kp = { - .symbol_name = "kprobe_target", - .pre_handler = kp_pre_handler, - .post_handler = kp_post_handler -}; - -static int test_kprobe(void) -{ - int ret; - - ret = register_kprobe(&kp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); - unregister_kprobe(&kp); - - if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); - handler_errors++; - } - - return 0; -} - -static noinline u32 kprobe_target2(u32 value) -{ - return (value / div_factor) + 1; -} - -static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) -{ - preh_val = (rand1 / div_factor) + 1; - return 0; -} - -static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - if (preh_val != (rand1 / div_factor) + 1) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); - } - posth_val = preh_val + div_factor; -} - -static struct kprobe kp2 = { - .symbol_name = "kprobe_target2", - .pre_handler = 
kp_pre_handler2, - .post_handler = kp_post_handler2 -}; - -static int test_kprobes(void) -{ - int ret; - struct kprobe *kps[2] = {&kp, &kp2}; - - /* addr and flags should be cleard for reusing kprobe. */ - kp.addr = NULL; - kp.flags = 0; - ret = register_kprobes(kps, 2); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); - return ret; - } - - preh_val = 0; - posth_val = 0; - ret = target(rand1); - - if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); - handler_errors++; - } - - preh_val = 0; - posth_val = 0; - ret = target2(rand1); - - if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); - handler_errors++; - } - - unregister_kprobes(kps, 2); - return 0; - -} - -static u32 j_kprobe_target(u32 value) -{ - if (value != rand1) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); - } - - jph_val = rand1; - jprobe_return(); - return 0; -} - -static struct jprobe jp = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target" -}; - -static int test_jprobe(void) -{ - int ret; - - ret = register_jprobe(&jp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); - unregister_jprobe(&jp); - if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - -static struct jprobe jp2 = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target2" -}; - -static int test_jprobes(void) -{ - int ret; - struct jprobe 
*jps[2] = {&jp, &jp2}; - - /* addr and flags should be cleard for reusing kprobe. */ - jp.kp.addr = NULL; - jp.kp.flags = 0; - ret = register_jprobes(jps, 2); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); - return ret; - } - - jph_val = 0; - ret = target(rand1); - if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); - handler_errors++; - } - - jph_val = 0; - ret = target2(rand1); - if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); - handler_errors++; - } - unregister_jprobes(jps, 2); - - return 0; -} -#ifdef CONFIG_KRETPROBES -static u32 krph_val; - -static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - krph_val = (rand1 / div_factor); - return 0; -} - -static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long ret = regs_return_value(regs); - - if (ret != (rand1 / div_factor)) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); - } - if (krph_val == 0) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); - } - - krph_val = rand1; - return 0; -} - -static struct kretprobe rp = { - .handler = return_handler, - .entry_handler = entry_handler, - .kp.symbol_name = "kprobe_target" -}; - -static int test_kretprobe(void) -{ - int ret; - - ret = register_kretprobe(&rp); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); - unregister_kretprobe(&rp); - if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - -static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long ret = regs_return_value(regs); 
- - if (ret != (rand1 / div_factor) + 1) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); - } - if (krph_val == 0) { - handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); - } - - krph_val = rand1; - return 0; -} - -static struct kretprobe rp2 = { - .handler = return_handler2, - .entry_handler = entry_handler, - .kp.symbol_name = "kprobe_target2" -}; - -static int test_kretprobes(void) -{ - int ret; - struct kretprobe *rps[2] = {&rp, &rp2}; - - /* addr and flags should be cleard for reusing kprobe. */ - rp.kp.addr = NULL; - rp.kp.flags = 0; - ret = register_kretprobes(rps, 2); - if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); - return ret; - } - - krph_val = 0; - ret = target(rand1); - if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); - handler_errors++; - } - - krph_val = 0; - ret = target2(rand1); - if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); - handler_errors++; - } - unregister_kretprobes(rps, 2); - return 0; -} -#endif /* CONFIG_KRETPROBES */ - -int init_test_probes(void) -{ - int ret; - - target = kprobe_target; - target2 = kprobe_target2; - - do { - rand1 = random32(); - } while (rand1 <= div_factor); - - printk(KERN_INFO "Kprobe smoke test started\n"); - num_tests++; - ret = test_kprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_kprobes(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobes(); - if (ret < 0) - errors++; - -#ifdef CONFIG_KRETPROBES - num_tests++; - ret = test_kretprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_kretprobes(); - if (ret < 0) - errors++; -#endif /* CONFIG_KRETPROBES */ - - if (errors) - printk(KERN_ERR "BUG: Kprobe 
smoke test: %d out of " - "%d tests failed\n", errors, num_tests); - else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); - else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); - - return 0; -} -/* - * Alarmtimer interface - * - * This interface provides a timer which is similarto hrtimers, - * but triggers a RTC alarm if the box is suspend. - * - * This interface is influenced by the Android RTC Alarm timer - * interface. - * - * Copyright (C) 2010 IBM Corperation - * - * Author: John Stultz - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base - * @timerqueue: Timerqueue head managing the list of events - * @timer: hrtimer used to schedule events while running - * @gettime: Function to read the time correlating to the base - * @base_clockid: clockid for the base - */ -static struct alarm_base { - spinlock_t lock; - struct timerqueue_head timerqueue; - struct hrtimer timer; - ktime_t (*gettime)(void); - clockid_t base_clockid; -} alarm_bases[ALARM_NUMTYPE]; - -/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ -static ktime_t freezer_delta; -static DEFINE_SPINLOCK(freezer_delta_lock); - -#ifdef CONFIG_RTC_CLASS -/* rtc timer and device for setting alarm wakeups at suspend */ -static struct rtc_timer rtctimer; -static struct rtc_device *rtcdev; -static DEFINE_SPINLOCK(rtcdev_lock); - -/** - * alarmtimer_get_rtcdev - Return selected rtcdevice - * - * This function returns the rtc device to use for wakealarms. - * If one has not already been chosen, it checks to see if a - * functional rtc device is available. 
- */ -static struct rtc_device *alarmtimer_get_rtcdev(void) -{ - unsigned long flags; - struct rtc_device *ret; - - spin_lock_irqsave(&rtcdev_lock, flags); - ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); - - return ret; -} - - -static int alarmtimer_rtc_add_device(struct device *dev, - struct class_interface *class_intf) -{ - unsigned long flags; - struct rtc_device *rtc = to_rtc_device(dev); - - if (rtcdev) - return -EBUSY; - - if (!rtc->ops->set_alarm) - return -1; - if (!device_may_wakeup(rtc->dev.parent)) - return -1; - - spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - } - spin_unlock_irqrestore(&rtcdev_lock, flags); - return 0; -} - -static struct class_interface alarmtimer_rtc_interface = { - .add_dev = &alarmtimer_rtc_add_device, -}; - -static int alarmtimer_rtc_interface_setup(void) -{ - alarmtimer_rtc_interface.class = rtc_class; - return class_interface_register(&alarmtimer_rtc_interface); -} -static void alarmtimer_rtc_interface_remove(void) -{ - class_interface_unregister(&alarmtimer_rtc_interface); -} -#else -static inline struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} -#define rtcdev (NULL) -static inline int alarmtimer_rtc_interface_setup(void) { return 0; } -static inline void alarmtimer_rtc_interface_remove(void) { } -#endif - -/** - * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue - * @base: pointer to the base where the timer is being run - * @alarm: pointer to alarm being enqueued. - * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. - * - * Must hold base->lock when calling. 
- */ -static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) -{ - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->state |= ALARMTIMER_STATE_ENQUEUED; - - if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { - hrtimer_try_to_cancel(&base->timer); - hrtimer_start(&base->timer, alarm->node.expires, - HRTIMER_MODE_ABS); - } -} - -/** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue - * @base: pointer to the base where the timer is running - * @alarm: pointer to alarm being removed - * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. - * - * Must hold base->lock when calling. - */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) -{ - struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); - - if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) - return; - - timerqueue_del(&base->timerqueue, &alarm->node); - alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - if (next == &alarm->node) { - hrtimer_try_to_cancel(&base->timer); - next = timerqueue_getnext(&base->timerqueue); - if (!next) - return; - hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); - } -} - - -/** - * alarmtimer_fired - Handles alarm hrtimer being fired. - * @timer: pointer to hrtimer being run - * - * When a alarm timer fires, this runs through the timerqueue to - * see which alarms expired, and runs those. If there are more alarm - * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. 
- */ -static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) -{ - struct alarm_base *base = container_of(timer, struct alarm_base, timer); - struct timerqueue_node *next; - unsigned long flags; - ktime_t now; - int ret = HRTIMER_NORESTART; - int restart = ALARMTIMER_NORESTART; - - spin_lock_irqsave(&base->lock, flags); - now = base->gettime(); - while ((next = timerqueue_getnext(&base->timerqueue))) { - struct alarm *alarm; - ktime_t expired = next->expires; - - if (expired.tv64 > now.tv64) - break; - - alarm = container_of(next, struct alarm, node); - - timerqueue_del(&base->timerqueue, &alarm->node); - alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - alarm->state |= ALARMTIMER_STATE_CALLBACK; - spin_unlock_irqrestore(&base->lock, flags); - if (alarm->function) - restart = alarm->function(alarm, now); - spin_lock_irqsave(&base->lock, flags); - alarm->state &= ~ALARMTIMER_STATE_CALLBACK; - - if (restart != ALARMTIMER_NORESTART) { - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->state |= ALARMTIMER_STATE_ENQUEUED; - } - } - - if (next) { - hrtimer_set_expires(&base->timer, next->expires); - ret = HRTIMER_RESTART; - } - spin_unlock_irqrestore(&base->lock, flags); - - return ret; - -} - -#ifdef CONFIG_RTC_CLASS -/** - * alarmtimer_suspend - Suspend time callback - * @dev: unused - * @state: unused - * - * When we are going into suspend, we look through the bases - * to see which is the soonest timer to expire. We then - * set an rtc timer to fire that far into the future, which - * will wake us from suspend. 
- */ -static int alarmtimer_suspend(struct device *dev) -{ - struct rtc_time tm; - ktime_t min, now; - unsigned long flags; - struct rtc_device *rtc; - int i; - - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - freezer_delta = ktime_set(0, 0); - spin_unlock_irqrestore(&freezer_delta_lock, flags); - - rtc = alarmtimer_get_rtcdev(); - /* If we have no rtcdev, just return */ - if (!rtc) - return 0; - - /* Find the soonest timer to expire*/ - for (i = 0; i < ALARM_NUMTYPE; i++) { - struct alarm_base *base = &alarm_bases[i]; - struct timerqueue_node *next; - ktime_t delta; - - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); - if (!next) - continue; - delta = ktime_sub(next->expires, base->gettime()); - if (!min.tv64 || (delta.tv64 < min.tv64)) - min = delta; - } - if (min.tv64 == 0) - return 0; - - /* XXX - Should we enforce a minimum sleep time? */ - WARN_ON(min.tv64 < NSEC_PER_SEC); - - /* Setup an rtc timer to fire that far in the future */ - rtc_timer_cancel(rtc, &rtctimer); - rtc_read_time(rtc, &tm); - now = rtc_tm_to_ktime(tm); - now = ktime_add(now, min); - - rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - - return 0; -} -#else -static int alarmtimer_suspend(struct device *dev) -{ - return 0; -} -#endif - -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) -{ - ktime_t delta; - unsigned long flags; - struct alarm_base *base = &alarm_bases[type]; - - delta = ktime_sub(absexp, base->gettime()); - - spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) - freezer_delta = delta; - spin_unlock_irqrestore(&freezer_delta_lock, flags); -} - - -/** - * alarm_init - Initialize an alarm structure - * @alarm: ptr to alarm to be initialized - * @type: the type of the alarm - * @function: callback that is run when the alarm fires - */ -void alarm_init(struct alarm *alarm, enum 
alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) -{ - timerqueue_init(&alarm->node); - alarm->function = function; - alarm->type = type; - alarm->state = ALARMTIMER_STATE_INACTIVE; -} - -/** - * alarm_start - Sets an alarm to fire - * @alarm: ptr to alarm to set - * @start: time to run the alarm - */ -void alarm_start(struct alarm *alarm, ktime_t start) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - - spin_lock_irqsave(&base->lock, flags); - if (alarmtimer_active(alarm)) - alarmtimer_remove(base, alarm); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); -} - -/** - * alarm_try_to_cancel - Tries to cancel an alarm timer - * @alarm: ptr to alarm to be canceled - * - * Returns 1 if the timer was canceled, 0 if it was not running, - * and -1 if the callback was running - */ -int alarm_try_to_cancel(struct alarm *alarm) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - int ret = -1; - spin_lock_irqsave(&base->lock, flags); - - if (alarmtimer_callback_running(alarm)) - goto out; - - if (alarmtimer_is_queued(alarm)) { - alarmtimer_remove(base, alarm); - ret = 1; - } else - ret = 0; -out: - spin_unlock_irqrestore(&base->lock, flags); - return ret; -} - - -/** - * alarm_cancel - Spins trying to cancel an alarm timer until it is done - * @alarm: ptr to alarm to be canceled - * - * Returns 1 if the timer was canceled, 0 if it was not active. 
- */ -int alarm_cancel(struct alarm *alarm) -{ - for (;;) { - int ret = alarm_try_to_cancel(alarm); - if (ret >= 0) - return ret; - cpu_relax(); - } -} - - -u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) -{ - u64 overrun = 1; - ktime_t delta; - - delta = ktime_sub(now, alarm->node.expires); - - if (delta.tv64 < 0) - return 0; - - if (unlikely(delta.tv64 >= interval.tv64)) { - s64 incr = ktime_to_ns(interval); - - overrun = ktime_divns(delta, incr); - - alarm->node.expires = ktime_add_ns(alarm->node.expires, - incr*overrun); - - if (alarm->node.expires.tv64 > now.tv64) - return overrun; - /* - * This (and the ktime_add() below) is the - * correction for exact: - */ - overrun++; - } - - alarm->node.expires = ktime_add(alarm->node.expires, interval); - return overrun; -} - - - - -/** - * clock2alarm - helper that converts from clockid to alarmtypes - * @clockid: clockid. - */ -static enum alarmtimer_type clock2alarm(clockid_t clockid) -{ - if (clockid == CLOCK_REALTIME_ALARM) - return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; -} - -/** - * alarm_handle_timer - Callback for posix timers - * @alarm: alarm that fired - * - * Posix timer callback for expired alarm timers. 
- */ -static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, - ktime_t now) -{ - struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); - if (posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; - - /* Re-add periodic timers */ - if (ptr->it.alarm.interval.tv64) { - ptr->it_overrun += alarm_forward(alarm, now, - ptr->it.alarm.interval); - return ALARMTIMER_RESTART; - } - return ALARMTIMER_NORESTART; -} - -/** - * alarm_clock_getres - posix getres interface - * @which_clock: clockid - * @tp: timespec to fill - * - * Returns the granularity of underlying alarm base clock - */ -static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) -{ - clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; - - if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; - - return hrtimer_get_res(baseid, tp); -} - -/** - * alarm_clock_get - posix clock_get interface - * @which_clock: clockid - * @tp: timespec to fill. - * - * Provides the underlying alarm base time. - */ -static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) -{ - struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; - - if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; - - *tp = ktime_to_timespec(base->gettime()); - return 0; -} - -/** - * alarm_timer_create - posix timer_create interface - * @new_timer: k_itimer pointer to manage - * - * Initializes the k_itimer structure. 
- */ -static int alarm_timer_create(struct k_itimer *new_timer) -{ - enum alarmtimer_type type; - struct alarm_base *base; - - if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; - - if (!capable(CAP_WAKE_ALARM)) - return -EPERM; - - type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; - alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); - return 0; -} - -/** - * alarm_timer_get - posix timer_get interface - * @new_timer: k_itimer pointer - * @cur_setting: itimerspec data to fill - * - * Copies the itimerspec data out from the k_itimer - */ -static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec *cur_setting) -{ - memset(cur_setting, 0, sizeof(struct itimerspec)); - - cur_setting->it_interval = - ktime_to_timespec(timr->it.alarm.interval); - cur_setting->it_value = - ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); - return; -} - -/** - * alarm_timer_del - posix timer_del interface - * @timr: k_itimer pointer to be deleted - * - * Cancels any programmed alarms for the given timer. - */ -static int alarm_timer_del(struct k_itimer *timr) -{ - if (!rtcdev) - return -ENOTSUPP; - - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - return 0; -} - -/** - * alarm_timer_set - posix timer_set interface - * @timr: k_itimer pointer to be deleted - * @flags: timer flags - * @new_setting: itimerspec to be used - * @old_setting: itimerspec being replaced - * - * Sets the timer to new_setting, and starts the timer. 
- */ -static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, - struct itimerspec *old_setting) -{ - if (!rtcdev) - return -ENOTSUPP; - - if (old_setting) - alarm_timer_get(timr, old_setting); - - /* If the timer was already set, cancel it */ - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - /* start the timer */ - timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - alarm_start(&timr->it.alarm.alarmtimer, - timespec_to_ktime(new_setting->it_value)); - return 0; -} - -/** - * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep - * @alarm: ptr to alarm that fired - * - * Wakes up the task that set the alarmtimer - */ -static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, - ktime_t now) -{ - struct task_struct *task = (struct task_struct *)alarm->data; - - alarm->data = NULL; - if (task) - wake_up_process(task); - return ALARMTIMER_NORESTART; -} - -/** - * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation - * @alarm: ptr to alarmtimer - * @absexp: absolute expiration time - * - * Sets the alarm timer and sleeps until it is fired or interrupted. 
- */ -static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) -{ - alarm->data = (void *)current; - do { - set_current_state(TASK_INTERRUPTIBLE); - alarm_start(alarm, absexp); - if (likely(alarm->data)) - schedule(); - - alarm_cancel(alarm); - } while (alarm->data && !signal_pending(current)); - - __set_current_state(TASK_RUNNING); - - return (alarm->data == NULL); -} - - -/** - * update_rmtp - Update remaining timespec value - * @exp: expiration time - * @type: timer type - * @rmtp: user pointer to remaining timepsec value - * - * Helper function that fills in rmtp value with time between - * now and the exp value - */ -static int update_rmtp(ktime_t exp, enum alarmtimer_type type, - struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = ktime_sub(exp, alarm_bases[type].gettime()); - - if (rem.tv64 <= 0) - return 0; - rmt = ktime_to_timespec(rem); - - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - - return 1; - -} - -/** - * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep - * @restart: ptr to restart block - * - * Handles restarted clock_nanosleep calls - */ -static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) -{ - enum alarmtimer_type type = restart->nanosleep.clockid; - ktime_t exp; - struct timespec __user *rmtp; - struct alarm alarm; - int ret = 0; - - exp.tv64 = restart->nanosleep.expires; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } - - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - return ret; -} - -/** - * alarm_timer_nsleep - alarmtimer nanosleep - * @which_clock: clockid - * @flags: determins abstime or relative - * @tsreq: requested sleep time (abs or rel) - * @rmtp: 
remaining sleep time saved - * - * Handles clock_nanosleep calls against _ALARM clockids - */ -static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsreq, struct timespec __user *rmtp) -{ - enum alarmtimer_type type = clock2alarm(which_clock); - struct alarm alarm; - ktime_t exp; - int ret = 0; - struct restart_block *restart; - - if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; - - if (!capable(CAP_WAKE_ALARM)) - return -EPERM; - - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - - exp = timespec_to_ktime(*tsreq); - /* Convert (if necessary) to absolute time */ - if (flags != TIMER_ABSTIME) { - ktime_t now = alarm_bases[type].gettime(); - exp = ktime_add(now, exp); - } - - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); - - /* abs timers don't set remaining time or restart */ - if (flags == TIMER_ABSTIME) { - ret = -ERESTARTNOHAND; - goto out; - } - - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } - - restart = ¤t_thread_info()->restart_block; - restart->fn = alarm_timer_nsleep_restart; - restart->nanosleep.clockid = type; - restart->nanosleep.expires = exp.tv64; - restart->nanosleep.rmtp = rmtp; - ret = -ERESTART_RESTARTBLOCK; - -out: - return ret; -} - - -/* Suspend hook structures */ -static const struct dev_pm_ops alarmtimer_pm_ops = { - .suspend = alarmtimer_suspend, -}; - -static struct platform_driver alarmtimer_driver = { - .driver = { - .name = "alarmtimer", - .pm = &alarmtimer_pm_ops, - } -}; - -/** - * alarmtimer_init - Initialize alarm timer code - * - * This function initializes the alarm bases and registers - * the posix clock ids. 
- */ -static int __init alarmtimer_init(void) -{ - struct platform_device *pdev; - int error = 0; - int i; - struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .nsleep = alarm_timer_nsleep, - }; - - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); - - /* Initialize alarm bases */ - alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; - alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; - alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; - alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; - for (i = 0; i < ALARM_NUMTYPE; i++) { - timerqueue_init_head(&alarm_bases[i].timerqueue); - spin_lock_init(&alarm_bases[i].lock); - hrtimer_init(&alarm_bases[i].timer, - alarm_bases[i].base_clockid, - HRTIMER_MODE_ABS); - alarm_bases[i].timer.function = alarmtimer_fired; - } - - error = alarmtimer_rtc_interface_setup(); - if (error) - return error; - - error = platform_driver_register(&alarmtimer_driver); - if (error) - goto out_if; - - pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); - if (IS_ERR(pdev)) { - error = PTR_ERR(pdev); - goto out_drv; - } - return 0; - -out_drv: - platform_driver_unregister(&alarmtimer_driver); -out_if: - alarmtimer_rtc_interface_remove(); - return error; -} -device_initcall(alarmtimer_init); -/* - * linux/kernel/time/clockevents.c - * - * This file contains functions which manage clock event devices. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. 
- */ - -#include -#include -#include -#include -#include -#include - -#include "tick-internal.h" - -/* The registered clock event devices */ -static LIST_HEAD(clockevent_devices); -static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - -/* Protection for the above */ -static DEFINE_RAW_SPINLOCK(clockevents_lock); - -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) -{ - u64 clc = (u64) latch << evt->shift; - - if (unlikely(!evt->mult)) { - evt->mult = 1; - WARN_ON(1); - } - - do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - - return clc; -} -EXPORT_SYMBOL_GPL(clockevent_delta2ns); - -/** - * clockevents_set_mode - set the operating mode of a clock event device - * @dev: device to modify - * @mode: new mode - * - * Must be called with interrupts disabled ! 
- */ -void clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode) -{ - if (dev->mode != mode) { - dev->set_mode(mode, dev); - dev->mode = mode; - - /* - * A nsec2cyc multiplicator of 0 is invalid and we'd crash - * on it, so fix it up and emit a warning: - */ - if (mode == CLOCK_EVT_MODE_ONESHOT) { - if (unlikely(!dev->mult)) { - dev->mult = 1; - WARN_ON(1); - } - } - } -} - -/** - * clockevents_shutdown - shutdown the device and clear next_event - * @dev: device to shutdown - */ -void clockevents_shutdown(struct clock_event_device *dev) -{ - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - dev->next_event.tv64 = KTIME_MAX; -} - -#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - -/* Limit min_delta to a jiffie */ -#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) - -/** - * clockevents_increase_min_delta - raise minimum delta of a clock event device - * @dev: device to increase the minimum delta - * - * Returns 0 on success, -ETIME when the minimum delta reached the limit. - */ -static int clockevents_increase_min_delta(struct clock_event_device *dev) -{ - /* Nothing to do if we already reached the limit */ - if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { - printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); - dev->next_event.tv64 = KTIME_MAX; - return -ETIME; - } - - if (dev->min_delta_ns < 5000) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - if (dev->min_delta_ns > MIN_DELTA_LIMIT) - dev->min_delta_ns = MIN_DELTA_LIMIT; - - printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); - return 0; -} - -/** - * clockevents_program_min_delta - Set clock event device to the minimum delay. - * @dev: device to program - * - * Returns 0 on success, -ETIME when the retry loop failed. 
- */ -static int clockevents_program_min_delta(struct clock_event_device *dev) -{ - unsigned long long clc; - int64_t delta; - int i; - - for (i = 0;;) { - delta = dev->min_delta_ns; - dev->next_event = ktime_add_ns(ktime_get(), delta); - - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) - return 0; - - dev->retries++; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - if (dev->set_next_event((unsigned long) clc, dev) == 0) - return 0; - - if (++i > 2) { - /* - * We tried 3 times to program the device with the - * given min_delta_ns. Try to increase the minimum - * delta, if that fails as well get out of here. - */ - if (clockevents_increase_min_delta(dev)) - return -ETIME; - i = 0; - } - } -} - -#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ - -/** - * clockevents_program_min_delta - Set clock event device to the minimum delay. - * @dev: device to program - * - * Returns 0 on success, -ETIME when the retry loop failed. - */ -static int clockevents_program_min_delta(struct clock_event_device *dev) -{ - unsigned long long clc; - int64_t delta; - - delta = dev->min_delta_ns; - dev->next_event = ktime_add_ns(ktime_get(), delta); - - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) - return 0; - - dev->retries++; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - return dev->set_next_event((unsigned long) clc, dev); -} - -#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ - -/** - * clockevents_program_event - Reprogram the clock event device. - * @dev: device to program - * @expires: absolute expiry time (monotonic clock) - * @force: program minimum delay if expires can not be set - * - * Returns 0 on success, -ETIME when the event is in the past. 
- */ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - bool force) -{ - unsigned long long clc; - int64_t delta; - int rc; - - if (unlikely(expires.tv64 < 0)) { - WARN_ON_ONCE(1); - return -ETIME; - } - - dev->next_event = expires; - - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) - return 0; - - /* Shortcut for clockevent devices that can deal with ktime. */ - if (dev->features & CLOCK_EVT_FEAT_KTIME) - return dev->set_next_ktime(expires, dev); - - delta = ktime_to_ns(ktime_sub(expires, ktime_get())); - if (delta <= 0) - return force ? clockevents_program_min_delta(dev) : -ETIME; - - delta = min(delta, (int64_t) dev->max_delta_ns); - delta = max(delta, (int64_t) dev->min_delta_ns); - - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); - - return (rc && force) ? clockevents_program_min_delta(dev) : rc; -} - -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - -/* - * Called after a notify add to make devices available which were - * released from the notifier call. 
- */ -static void clockevents_notify_released(void) -{ - struct clock_event_device *dev; - - while (!list_empty(&clockevents_released)) { - dev = list_entry(clockevents_released.next, - struct clock_event_device, list); - list_del(&dev->list); - list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); - } -} - -/** - * clockevents_register_device - register a clock event device - * @dev: device to register - */ -void clockevents_register_device(struct clock_event_device *dev) -{ - unsigned long flags; - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - if (!dev->cpumask) { - WARN_ON(num_possible_cpus() > 1); - dev->cpumask = cpumask_of(smp_processor_id()); - } - - raw_spin_lock_irqsave(&clockevents_lock, flags); - - list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); - clockevents_notify_released(); - - raw_spin_unlock_irqrestore(&clockevents_lock, flags); -} -EXPORT_SYMBOL_GPL(clockevents_register_device); - -static void clockevents_config(struct clock_event_device *dev, - u32 freq) -{ - u64 sec; - - if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) - return; - - /* - * Calculate the maximum number of seconds we can sleep. Limit - * to 10 minutes for hardware which can program more than - * 32bit ticks so we still get reasonable conversion values. 
- */ - sec = dev->max_delta_ticks; - do_div(sec, freq); - if (!sec) - sec = 1; - else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) - sec = 600; - - clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); -} - -/** - * clockevents_config_and_register - Configure and register a clock event device - * @dev: device to register - * @freq: The clock frequency - * @min_delta: The minimum clock ticks to program in oneshot mode - * @max_delta: The maximum clock ticks to program in oneshot mode - * - * min/max_delta can be 0 for devices which do not support oneshot mode. - */ -void clockevents_config_and_register(struct clock_event_device *dev, - u32 freq, unsigned long min_delta, - unsigned long max_delta) -{ - dev->min_delta_ticks = min_delta; - dev->max_delta_ticks = max_delta; - clockevents_config(dev, freq); - clockevents_register_device(dev); -} - -/** - * clockevents_update_freq - Update frequency and reprogram a clock event device. - * @dev: device to modify - * @freq: new device frequency - * - * Reconfigure and reprogram a clock event device in oneshot - * mode. Must be called on the cpu for which the device delivers per - * cpu timer events with interrupts disabled! Returns 0 on success, - * -ETIME when the event is in the past. - */ -int clockevents_update_freq(struct clock_event_device *dev, u32 freq) -{ - clockevents_config(dev, freq); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; - - return clockevents_program_event(dev, dev->next_event, false); -} - -/* - * Noop handler when we shut down an event device - */ -void clockevents_handle_noop(struct clock_event_device *dev) -{ -} - -/** - * clockevents_exchange_device - release and request clock devices - * @old: device to release (can be NULL) - * @new: device to request (can be NULL) - * - * Called from the notifier chain. 
clockevents_lock is held already - */ -void clockevents_exchange_device(struct clock_event_device *old, - struct clock_event_device *new) -{ - unsigned long flags; - - local_irq_save(flags); - /* - * Caller releases a clock event device. We queue it into the - * released list and do a notify add later. - */ - if (old) { - clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); - list_del(&old->list); - list_add(&old->list, &clockevents_released); - } - - if (new) { - BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_shutdown(new); - } - local_irq_restore(flags); -} - -#ifdef CONFIG_GENERIC_CLOCKEVENTS -/** - * clockevents_notify - notification about relevant events - */ -void clockevents_notify(unsigned long reason, void *arg) -{ - struct clock_event_device *dev, *tmp; - unsigned long flags; - int cpu; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); - - switch (reason) { - case CLOCK_EVT_NOTIFY_CPU_DEAD: - /* - * Unregister the clock event devices which were - * released from the users in the notify chain. - */ - list_for_each_entry_safe(dev, tmp, &clockevents_released, list) - list_del(&dev->list); - /* - * Now check whether the CPU has left unused per cpu devices - */ - cpu = *((int *)arg); - list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { - if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1 && - !tick_is_broadcast_device(dev)) { - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - list_del(&dev->list); - } - } - break; - default: - break; - } - raw_spin_unlock_irqrestore(&clockevents_lock, flags); -} -EXPORT_SYMBOL_GPL(clockevents_notify); -#endif -/* - * linux/kernel/time/clocksource.c - * - * This file contains the functions which manage clocksource drivers. 
- * - * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * TODO WishList: - * o Allow clocksource drivers to be unregistered - */ - -#include -#include -#include -#include -#include /* for spin_unlock_irq() using preempt_count() m68k */ -#include -#include - -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. 
- */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. - */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); - -/** - * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks - * @mult: pointer to mult variable - * @shift: pointer to shift variable - * @from: frequency to convert from - * @to: frequency to convert to - * @maxsec: guaranteed runtime conversion range in seconds - * - * The function evaluates the shift/mult pair for the scaled math - * operations of clocksources and clockevents. - * - * @to and @from are frequency values in HZ. For clock sources @to is - * NSEC_PER_SEC == 1GHz and @from is the counter frequency. 
For clock - * event @to is the counter frequency and @from is NSEC_PER_SEC. - * - * The @maxsec conversion range argument controls the time frame in - * seconds which must be covered by the runtime conversion with the - * calculated mult and shift factors. This guarantees that no 64bit - * overflow happens when the input value of the conversion is - * multiplied with the calculated mult factor. Larger ranges may - * reduce the conversion accuracy by chosing smaller mult and shift - * factors. - */ -void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) -{ - u64 tmp; - u32 sft, sftacc= 32; - - /* - * Calculate the shift factor which is limiting the conversion - * range: - */ - tmp = ((u64)maxsec * from) >> 32; - while (tmp) { - tmp >>=1; - sftacc--; - } - - /* - * Find the conversion shift/mult pair which has the best - * accuracy and fits the maxsec conversion range: - */ - for (sft = 32; sft > 0; sft--) { - tmp = (u64) to << sft; - tmp += from / 2; - do_div(tmp, from); - if ((tmp >> sftacc) == 0) - break; - } - *mult = tmp; - *shift = sft; -} - -/*[Clocksource internal variables]--------- - * curr_clocksource: - * currently selected clocksource. - * clocksource_list: - * linked list with the registered clocksources - * clocksource_mutex: - * protects manipulations to curr_clocksource and the clocksource_list - * override_name: - * Name of the user-specified clocksource. 
- */ -static struct clocksource *curr_clocksource; -static LIST_HEAD(clocksource_list); -static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; -static int finished_booting; - -#ifdef CONFIG_CLOCKSOURCE_WATCHDOG -static void clocksource_watchdog_work(struct work_struct *work); - -static LIST_HEAD(watchdog_list); -static struct clocksource *watchdog; -static struct timer_list watchdog_timer; -static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); -static DEFINE_SPINLOCK(watchdog_lock); -static int watchdog_running; -static atomic_t watchdog_reset_pending; - -static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -/* - * Interval: 0.5sec Threshold: 0.0625s - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) - -static void clocksource_watchdog_work(struct work_struct *work) -{ - /* - * If kthread_run fails the next watchdog scan over the - * watchdog_list will find the unstable clock again. - */ - kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); -} - -static void __clocksource_unstable(struct clocksource *cs) -{ - cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - cs->flags |= CLOCK_SOURCE_UNSTABLE; - if (finished_booting) - schedule_work(&watchdog_work); -} - -static void clocksource_unstable(struct clocksource *cs, int64_t delta) -{ - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - __clocksource_unstable(cs); -} - -/** - * clocksource_mark_unstable - mark clocksource unstable via watchdog - * @cs: clocksource to be marked unstable - * - * This function is called instead of clocksource_change_rating from - * cpu hotplug code to avoid a deadlock between the clocksource mutex - * and the cpu hotplug mutex. It defers the update of the clocksource - * to the watchdog thread. 
- */ -void clocksource_mark_unstable(struct clocksource *cs) -{ - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); - if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { - if (list_empty(&cs->wd_list)) - list_add(&cs->wd_list, &watchdog_list); - __clocksource_unstable(cs); - } - spin_unlock_irqrestore(&watchdog_lock, flags); -} - -static void clocksource_watchdog(unsigned long data) -{ - struct clocksource *cs; - cycle_t csnow, wdnow; - int64_t wd_nsec, cs_nsec; - int next_cpu, reset_pending; - - spin_lock(&watchdog_lock); - if (!watchdog_running) - goto out; - - reset_pending = atomic_read(&watchdog_reset_pending); - - list_for_each_entry(cs, &watchdog_list, wd_list) { - - /* Clocksource already marked unstable? */ - if (cs->flags & CLOCK_SOURCE_UNSTABLE) { - if (finished_booting) - schedule_work(&watchdog_work); - continue; - } - - local_irq_disable(); - csnow = cs->read(cs); - wdnow = watchdog->read(watchdog); - local_irq_enable(); - - /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || - atomic_read(&watchdog_reset_pending)) { - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = wdnow; - cs->cs_last = csnow; - continue; - } - - wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, - watchdog->mult, watchdog->shift); - - cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & - cs->mask, cs->mult, cs->shift); - cs->cs_last = csnow; - cs->wd_last = wdnow; - - if (atomic_read(&watchdog_reset_pending)) - continue; - - /* Check the deviation from the watchdog clocksource. 
*/ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - clocksource_unstable(cs, cs_nsec - wd_nsec); - continue; - } - - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as highres-capable, - * notify the rest of the system as well so that we - * transition into high-res mode: - */ - tick_clock_notify(); - } - } - - /* - * We only clear the watchdog_reset_pending, when we did a - * full cycle through all clocksources. - */ - if (reset_pending) - atomic_dec(&watchdog_reset_pending); - - /* - * Cycle through CPUs to check if the CPUs stay synchronized - * to each other. - */ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); -out: - spin_unlock(&watchdog_lock); -} - -static inline void clocksource_start_watchdog(void) -{ - if (watchdog_running || !watchdog || list_empty(&watchdog_list)) - return; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); - watchdog_running = 1; -} - -static inline void clocksource_stop_watchdog(void) -{ - if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) - return; - del_timer(&watchdog_timer); - watchdog_running = 0; -} - -static inline void clocksource_reset_watchdog(void) -{ - struct clocksource *cs; - - list_for_each_entry(cs, &watchdog_list, wd_list) - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} - -static void clocksource_resume_watchdog(void) -{ - atomic_inc(&watchdog_reset_pending); -} - -static void clocksource_enqueue_watchdog(struct clocksource *cs) -{ - unsigned long flags; - - 
spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a clocksource to be watched. */ - list_add(&cs->wd_list, &watchdog_list); - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - } else { - /* cs is a watchdog. */ - if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { - watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } - } - /* Check if the watchdog timer needs to be started. */ - clocksource_start_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); -} - -static void clocksource_dequeue_watchdog(struct clocksource *cs) -{ - struct clocksource *tmp; - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; - } - } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. */ - clocksource_stop_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); -} - -static int clocksource_watchdog_kthread(void *data) -{ - struct clocksource *cs, *tmp; - unsigned long flags; - LIST_HEAD(unstable); - - mutex_lock(&clocksource_mutex); - spin_lock_irqsave(&watchdog_lock, flags); - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) - if (cs->flags & CLOCK_SOURCE_UNSTABLE) { - list_del_init(&cs->wd_list); - list_add(&cs->wd_list, &unstable); - } - /* Check if the watchdog timer needs to be stopped. 
*/ - clocksource_stop_watchdog(); - spin_unlock_irqrestore(&watchdog_lock, flags); - - /* Needs to be done outside of watchdog lock */ - list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { - list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); - } - mutex_unlock(&clocksource_mutex); - return 0; -} - -#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ - -static void clocksource_enqueue_watchdog(struct clocksource *cs) -{ - if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; -} - -static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } -static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } - -#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ - -/** - * clocksource_suspend - suspend the clocksource(s) - */ -void clocksource_suspend(void) -{ - struct clocksource *cs; - - list_for_each_entry_reverse(cs, &clocksource_list, list) - if (cs->suspend) - cs->suspend(cs); -} - -/** - * clocksource_resume - resume the clocksource(s) - */ -void clocksource_resume(void) -{ - struct clocksource *cs; - - list_for_each_entry(cs, &clocksource_list, list) - if (cs->resume) - cs->resume(cs); - - clocksource_resume_watchdog(); -} - -/** - * clocksource_touch_watchdog - Update watchdog - * - * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. This might fail when the kernel - * was stopped in code which holds watchdog_lock. 
- */ -void clocksource_touch_watchdog(void) -{ - clocksource_resume_watchdog(); -} - -/** - * clocksource_max_adjustment- Returns max adjustment amount - * @cs: Pointer to clocksource - * - */ -static u32 clocksource_max_adjustment(struct clocksource *cs) -{ - u64 ret; - /* - * We won't try to correct for more then 11% adjustments (110,000 ppm), - */ - ret = (u64)cs->mult * 11; - do_div(ret,100); - return (u32)ret; -} - -/** - * clocksource_max_deferment - Returns max time the clocksource can be deferred - * @cs: Pointer to clocksource - * - */ -static u64 clocksource_max_deferment(struct clocksource *cs) -{ - u64 max_nsecs, max_cycles; - - /* - * Calculate the maximum number of cycles that we can pass to the - * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) - * which is equivalent to the below. - * max_cycles < (2^63)/(cs->mult + cs->maxadj) - * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) - * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) - * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) - * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) - * Please note that we add 1 to the result of the log2 to account for - * any rounding errors, ensure the above inequality is satisfied and - * no overflow will occur. - */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); - - /* - * The actual maximum number of cycles we can defer the clocksource is - * determined by the minimum of max_cycles and cs->mask. - * Note: Here we subtract the maxadj to make sure we don't sleep for - * too long if there's a large negative adjustment. - */ - max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, - cs->shift); - - /* - * To ensure that the clocksource does not wrap whilst we are idle, - * limit the time the clocksource can be deferred by 12.5%. 
Please - * note a margin of 12.5% is used because this can be computed with - * a shift, versus say 10% which would require division. - */ - return max_nsecs - (max_nsecs >> 3); -} - -#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET - -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) -{ - struct clocksource *best, *cs; - - if (!finished_booting || list_empty(&clocksource_list)) - return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); - /* Check for the override clocksource. */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strcmp(cs->name, override_name) != 0) - continue; - /* - * Check to make sure we don't switch to a non-highres - * capable clocksource if the tick code is in oneshot - * mode (highres or nohz) - */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { - /* Override clocksource cannot be used. */ - printk(KERN_WARNING "Override clocksource %s is not " - "HRT compatible. Cannot switch while in " - "HRT/NOHZ mode\n", cs->name); - override_name[0] = 0; - } else - /* Override clocksource can be used. */ - best = cs; - break; - } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); - curr_clocksource = best; - timekeeping_notify(curr_clocksource); - } -} - -#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ - -static inline void clocksource_select(void) { } - -#endif - -/* - * clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. 
- */ -static int __init clocksource_done_booting(void) -{ - mutex_lock(&clocksource_mutex); - curr_clocksource = clocksource_default_clock(); - mutex_unlock(&clocksource_mutex); - - finished_booting = 1; - - /* - * Run the watchdog first to eliminate unstable clock sources - */ - clocksource_watchdog_kthread(NULL); - - mutex_lock(&clocksource_mutex); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -fs_initcall(clocksource_done_booting); - -/* - * Enqueue the clocksource sorted by rating - */ -static void clocksource_enqueue(struct clocksource *cs) -{ - struct list_head *entry = &clocksource_list; - struct clocksource *tmp; - - list_for_each_entry(tmp, &clocksource_list, list) - /* Keep track of the place, where to insert */ - if (tmp->rating >= cs->rating) - entry = &tmp->list; - list_add(&cs->list, entry); -} - -/** - * __clocksource_updatefreq_scale - Used update clocksource with new freq - * @cs: clocksource to be registered - * @scale: Scale factor multiplied against freq to get clocksource hz - * @freq: clocksource frequency (cycles per second) divided by scale - * - * This should only be called from the clocksource->enable() method. - * - * This *SHOULD NOT* be called directly! Please use the - * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. - */ -void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) -{ - u64 sec; - /* - * Calc the maximum number of seconds which we can run before - * wrapping around. For clocksources which have a mask > 32bit - * we need to limit the max sleep time to have a good - * conversion precision. 10 minutes is still a reasonable - * amount. That results in a shift value of 24 for a - * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. 
We apply the same 12.5% - * margin as we do in clocksource_max_deferment() - */ - sec = (cs->mask - (cs->mask >> 3)); - do_div(sec, freq); - do_div(sec, scale); - if (!sec) - sec = 1; - else if (sec > 600 && cs->mask > UINT_MAX) - sec = 600; - - clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC / scale, sec * scale); - - /* - * for clocksources that have large mults, to avoid overflow. - * Since mult may be adjusted by ntp, add an safety extra margin - * - */ - cs->maxadj = clocksource_max_adjustment(cs); - while ((cs->mult + cs->maxadj < cs->mult) - || (cs->mult - cs->maxadj > cs->mult)) { - cs->mult >>= 1; - cs->shift--; - cs->maxadj = clocksource_max_adjustment(cs); - } - - cs->max_idle_ns = clocksource_max_deferment(cs); -} -EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); - -/** - * __clocksource_register_scale - Used to install new clocksources - * @cs: clocksource to be registered - * @scale: Scale factor multiplied against freq to get clocksource hz - * @freq: clocksource frequency (cycles per second) divided by scale - * - * Returns -EBUSY if registration fails, zero otherwise. - * - * This *SHOULD NOT* be called directly! Please use the - * clocksource_register_hz() or clocksource_register_khz helper functions. - */ -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) -{ - - /* Initialize mult/shift and max_idle_ns */ - __clocksource_updatefreq_scale(cs, scale, freq); - - /* Add clocksource to the clcoksource list */ - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(__clocksource_register_scale); - - -/** - * clocksource_register - Used to install new clocksources - * @cs: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. 
- */ -int clocksource_register(struct clocksource *cs) -{ - /* calculate max adjustment for given mult/shift */ - cs->maxadj = clocksource_max_adjustment(cs); - WARN_ONCE(cs->mult + cs->maxadj < cs->mult, - "Clocksource %s might overflow on 11%% adjustment\n", - cs->name); - - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); - - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL(clocksource_register); - -static void __clocksource_change_rating(struct clocksource *cs, int rating) -{ - list_del(&cs->list); - cs->rating = rating; - clocksource_enqueue(cs); - clocksource_select(); -} - -/** - * clocksource_change_rating - Change the rating of a registered clocksource - * @cs: clocksource to be changed - * @rating: new rating - */ -void clocksource_change_rating(struct clocksource *cs, int rating) -{ - mutex_lock(&clocksource_mutex); - __clocksource_change_rating(cs, rating); - mutex_unlock(&clocksource_mutex); -} -EXPORT_SYMBOL(clocksource_change_rating); - -/** - * clocksource_unregister - remove a registered clocksource - * @cs: clocksource to be unregistered - */ -void clocksource_unregister(struct clocksource *cs) -{ - mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); - mutex_unlock(&clocksource_mutex); -} -EXPORT_SYMBOL(clocksource_unregister); - -#ifdef CONFIG_SYSFS -/** - * sysfs_show_current_clocksources - sysfs interface for current clocksource - * @dev: unused - * @attr: unused - * @buf: char buffer to be filled with clocksource list - * - * Provides sysfs interface for listing current clocksource. 
- */ -static ssize_t -sysfs_show_current_clocksources(struct device *dev, - struct device_attribute *attr, char *buf) -{ - ssize_t count = 0; - - mutex_lock(&clocksource_mutex); - count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - mutex_unlock(&clocksource_mutex); - - return count; -} - -/** - * sysfs_override_clocksource - interface for manually overriding clocksource - * @dev: unused - * @attr: unused - * @buf: name of override clocksource - * @count: length of buffer - * - * Takes input from sysfs interface for manually overriding the default - * clocksource selection. - */ -static ssize_t sysfs_override_clocksource(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - - mutex_lock(&clocksource_mutex); - - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); - - mutex_unlock(&clocksource_mutex); - - return ret; -} - -/** - * sysfs_show_available_clocksources - sysfs interface for listing clocksource - * @dev: unused - * @attr: unused - * @buf: char buffer to be filled with clocksource list - * - * Provides sysfs interface for listing registered clocksources - */ -static ssize_t -sysfs_show_available_clocksources(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct clocksource *src; - ssize_t count = 0; - - mutex_lock(&clocksource_mutex); - list_for_each_entry(src, &clocksource_list, list) { - /* - * Don't show non-HRES clocksource if the tick code is - * in one shot mode (highres=on or nohz=on) - */ - if (!tick_oneshot_mode_active() || - (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) - count += snprintf(buf + count, - max((ssize_t)PAGE_SIZE - count, (ssize_t)0), - "%s ", src->name); - } - mutex_unlock(&clocksource_mutex); - - count += 
snprintf(buf + count, - max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); - - return count; -} - -/* - * Sysfs setup bits: - */ -static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, - sysfs_override_clocksource); - -static DEVICE_ATTR(available_clocksource, 0444, - sysfs_show_available_clocksources, NULL); - -static struct bus_type clocksource_subsys = { - .name = "clocksource", - .dev_name = "clocksource", -}; - -static struct device device_clocksource = { - .id = 0, - .bus = &clocksource_subsys, -}; - -static int __init init_clocksource_sysfs(void) -{ - int error = subsys_system_register(&clocksource_subsys, NULL); - - if (!error) - error = device_register(&device_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_current_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_available_clocksource); - return error; -} - -device_initcall(init_clocksource_sysfs); -#endif /* CONFIG_SYSFS */ - -/** - * boot_override_clocksource - boot clock override - * @str: override name - * - * Takes a clocksource= boot argument and uses it - * as the clocksource override name. - */ -static int __init boot_override_clocksource(char* str) -{ - mutex_lock(&clocksource_mutex); - if (str) - strlcpy(override_name, str, sizeof(override_name)); - mutex_unlock(&clocksource_mutex); - return 1; -} - -__setup("clocksource=", boot_override_clocksource); - -/** - * boot_override_clock - Compatibility layer for deprecated boot option - * @str: override name - * - * DEPRECATED! Takes a clock= boot argument and uses it - * as the clocksource override name - */ -static int __init boot_override_clock(char* str) -{ - if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. " - "Use clocksource=acpi_pm.\n"); - return boot_override_clocksource("acpi_pm"); - } - printk("Warning! clock= boot option is deprecated. 
" - "Use clocksource=xyz\n"); - return boot_override_clocksource(str); -} - -__setup("clock=", boot_override_clock); -/*********************************************************************** -* linux/kernel/time/jiffies.c -* -* This file contains the jiffies based clocksource. -* -* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) -* -* This program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2 of the License, or -* (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with this program; if not, write to the Free Software -* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -* -************************************************************************/ -#include -#include -#include -#include - -#include "tick-internal.h" - -/* The Jiffies based clocksource is the lowest common - * denominator clock source which should function on - * all systems. It has the same coarse resolution as - * the timer interrupt frequency HZ and it suffers - * inaccuracies caused by missed or lost timer - * interrupts and the inability for the timer - * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not recommended - * for "tick-less" systems. - */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) - -/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. 
Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater than 8 overflow 32bits when - * HZ=100. - */ -#define JIFFIES_SHIFT 8 - -static cycle_t jiffies_read(struct clocksource *cs) -{ - return (cycle_t) jiffies; -} - -struct clocksource clocksource_jiffies = { - .name = "jiffies", - .rating = 1, /* lowest valid rating*/ - .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ - .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, -}; - -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) -{ - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; -} -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); - -static int __init init_jiffies_clocksource(void) -{ - return clocksource_register(&clocksource_jiffies); -} - -core_initcall(init_jiffies_clocksource); - -struct clocksource * __init __weak clocksource_default_clock(void) -{ - return &clocksource_jiffies; -} -/* - * NTP state machine interfaces and logic. - * - * This code was mainly moved from kernel/timer.c and kernel/time.c - * Please see those files for relevant copyright info and historical - * changelogs. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "tick-internal.h" - -/* - * NTP timekeeping variables: - */ - -/* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; - -/* ACTHZ period (nsecs): */ -unsigned long tick_nsec; - -u64 tick_length; -static u64 tick_length_base; - -static struct hrtimer leap_timer; - -#define MAX_TICKADJ 500LL /* usecs */ -#define MAX_TICKADJ_SCALED \ - (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) - -/* - * phase-lock loop variables - */ - -/* - * clock synchronization status - * - * (TIME_ERROR prevents overwriting the CMOS clock) - */ -static int time_state = TIME_OK; - -/* clock status bits: */ -int time_status = STA_UNSYNC; - -/* TAI offset (secs): */ -static long time_tai; - -/* time adjustment (nsecs): */ -static s64 time_offset; - -/* pll time constant: */ -static long time_constant = 2; - -/* maximum error (usecs): */ -static long time_maxerror = NTP_PHASE_LIMIT; - -/* estimated error (usecs): */ -static long time_esterror = NTP_PHASE_LIMIT; - -/* frequency offset (scaled nsecs/secs): */ -static s64 time_freq; - -/* time at last adjustment (secs): */ -static long time_reftime; - -static long time_adjust; - -/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ -static s64 ntp_tick_adj; - -#ifdef CONFIG_NTP_PPS - -/* - * The following variables are used when a pulse-per-second (PPS) signal - * is available. They establish the engineering parameters of the clock - * discipline loop when controlled by the PPS signal. 
- */ -#define PPS_VALID 10 /* PPS signal watchdog max (s) */ -#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ -#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ -#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ -#define PPS_INTCOUNT 4 /* number of consecutive good intervals to - increase pps_shift or consecutive bad - intervals to decrease it */ -#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ - -static int pps_valid; /* signal watchdog counter */ -static long pps_tf[3]; /* phase median filter */ -static long pps_jitter; /* current jitter (ns) */ -static struct timespec pps_fbase; /* beginning of the last freq interval */ -static int pps_shift; /* current interval duration (s) (shift) */ -static int pps_intcnt; /* interval counter */ -static s64 pps_freq; /* frequency offset (scaled ns/s) */ -static long pps_stabil; /* current stability (scaled ns/s) */ - -/* - * PPS signal quality monitors - */ -static long pps_calcnt; /* calibration intervals */ -static long pps_jitcnt; /* jitter limit exceeded */ -static long pps_stbcnt; /* stability limit exceeded */ -static long pps_errcnt; /* calibration errors */ - - -/* PPS kernel consumer compensates the whole phase error immediately. - * Otherwise, reduce the offset by a fixed factor times the time constant. 
- */ -static inline s64 ntp_offset_chunk(s64 offset) -{ - if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) - return offset; - else - return shift_right(offset, SHIFT_PLL + time_constant); -} - -static inline void pps_reset_freq_interval(void) -{ - /* the PPS calibration interval may end - surprisingly early */ - pps_shift = PPS_INTMIN; - pps_intcnt = 0; -} - -/** - * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the xtime_lock - */ -static inline void pps_clear(void) -{ - pps_reset_freq_interval(); - pps_tf[0] = 0; - pps_tf[1] = 0; - pps_tf[2] = 0; - pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; - pps_freq = 0; -} - -/* Decrease pps_valid to indicate that another second has passed since - * the last PPS signal. When it reaches 0, indicate that PPS signal is - * missing. - * - * Must be called while holding a write on the xtime_lock - */ -static inline void pps_dec_valid(void) -{ - if (pps_valid > 0) - pps_valid--; - else { - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - pps_clear(); - } -} - -static inline void pps_set_freq(s64 freq) -{ - pps_freq = freq; -} - -static inline int is_error_status(int status) -{ - return (time_status & (STA_UNSYNC|STA_CLOCKERR)) - /* PPS signal lost when either PPS time or - * PPS frequency synchronization requested - */ - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) - && !(time_status & STA_PPSSIGNAL)) - /* PPS jitter exceeded when - * PPS time synchronization requested */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) - == (STA_PPSTIME|STA_PPSJITTER)) - /* PPS wander exceeded or calibration error when - * PPS frequency synchronization requested - */ - || ((time_status & STA_PPSFREQ) - && (time_status & (STA_PPSWANDER|STA_PPSERROR))); -} - -static inline void pps_fill_timex(struct timex *txc) -{ - txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * - PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->jitter = pps_jitter; - if 
(!(time_status & STA_NANO)) - txc->jitter /= NSEC_PER_USEC; - txc->shift = pps_shift; - txc->stabil = pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; -} - -#else /* !CONFIG_NTP_PPS */ - -static inline s64 ntp_offset_chunk(s64 offset) -{ - return shift_right(offset, SHIFT_PLL + time_constant); -} - -static inline void pps_reset_freq_interval(void) {} -static inline void pps_clear(void) {} -static inline void pps_dec_valid(void) {} -static inline void pps_set_freq(s64 freq) {} - -static inline int is_error_status(int status) -{ - return status & (STA_UNSYNC|STA_CLOCKERR); -} - -static inline void pps_fill_timex(struct timex *txc) -{ - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; -} - -#endif /* CONFIG_NTP_PPS */ - -/* - * NTP methods: - */ - -/* - * Update (tick_length, tick_length_base, tick_nsec), based - * on (tick_usec, ntp_tick_adj, time_freq): - */ -static void ntp_update_frequency(void) -{ - u64 second_length; - u64 new_base; - - second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; - - second_length += ntp_tick_adj; - second_length += time_freq; - - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; - new_base = div_u64(second_length, NTP_INTERVAL_FREQ); - - /* - * Don't wait for the next second_overflow, apply - * the change to the tick length immediately: - */ - tick_length += new_base - tick_length_base; - tick_length_base = new_base; -} - -static inline s64 ntp_update_offset_fll(s64 offset64, long secs) -{ - time_status &= ~STA_MODE; - - if (secs < MINSEC) - return 0; - - if (!(time_status & STA_FLL) && (secs <= MAXSEC)) - return 0; - - time_status |= STA_MODE; - - return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); -} - -static void ntp_update_offset(long offset) -{ - s64 
freq_adj; - s64 offset64; - long secs; - - if (!(time_status & STA_PLL)) - return; - - if (!(time_status & STA_NANO)) - offset *= NSEC_PER_USEC; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - offset = min(offset, MAXPHASE); - offset = max(offset, -MAXPHASE); - - /* - * Select how the frequency is to be controlled - * and in which mode (PLL or FLL). - */ - secs = get_seconds() - time_reftime; - if (unlikely(time_status & STA_FREQHOLD)) - secs = 0; - - time_reftime = get_seconds(); - - offset64 = offset; - freq_adj = ntp_update_offset_fll(offset64, secs); - - /* - * Clamp update interval to reduce PLL gain with low - * sampling rate (e.g. intermittent network connection) - * to avoid instability. - */ - if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) - secs = 1 << (SHIFT_PLL + 1 + time_constant); - - freq_adj += (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); - - freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); - - time_freq = max(freq_adj, -MAXFREQ_SCALED); - - time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); -} - -/** - * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock - */ -void ntp_clear(void) -{ - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - - ntp_update_frequency(); - - tick_length = tick_length_base; - time_offset = 0; - - /* Clear PPS state variables */ - pps_clear(); -} - -/* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. 
- */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) -{ - enum hrtimer_restart res = HRTIMER_NORESTART; - - write_seqlock(&xtime_lock); - - switch (time_state) { - case TIME_OK: - break; - case TIME_INS: - timekeeping_leap_insert(-1); - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); - hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); - res = HRTIMER_RESTART; - break; - case TIME_DEL: - timekeeping_leap_insert(1); - time_tai--; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); - break; - case TIME_OOP: - time_tai++; - time_state = TIME_WAIT; - /* fall through */ - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - } - - write_sequnlock(&xtime_lock); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. 
- */ -void second_overflow(void) -{ - s64 delta; - - /* Bump the maxerror field */ - time_maxerror += MAXFREQ / NSEC_PER_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* Compute the phase adjustment for the next second */ - tick_length = tick_length_base; - - delta = ntp_offset_chunk(time_offset); - time_offset -= delta; - tick_length += delta; - - /* Check PPS signal */ - pps_dec_valid(); - - if (!time_adjust) - return; - - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; - return; - } - - if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; - return; - } - - tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) - << NTP_SCALE_SHIFT; - time_adjust = 0; -} - -#ifdef CONFIG_GENERIC_CMOS_UPDATE - -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock __read_mostly; - -static void sync_cmos_clock(struct work_struct *work); - -static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); - -static void sync_cmos_clock(struct work_struct *work) -{ - struct timespec now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) { - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
- */ - return; - } - - getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) - fail = update_persistent_clock(now); - - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); -} - -static void notify_cmos_timer(void) -{ - if (!no_sync_cmos_clock) - schedule_delayed_work(&sync_cmos_work, 0); -} - -#else -static inline void notify_cmos_timer(void) { } -#endif - -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ - long now = ts->tv_sec; - - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - - return; - } - - if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } -} - -/* - * Propagate a new txc->status value into the NTP state: - */ -static inline void process_adj_status(struct timex *txc, struct timespec *ts) -{ - if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status = STA_UNSYNC; - /* restart PPS frequency calibration */ - pps_reset_freq_interval(); - } - - /* - * If we turn on PLL adjustments then reset the - * reference time to current time. 
- */ - if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = get_seconds(); - - /* only set allowed bits */ - time_status &= STA_RONLY; - time_status |= txc->status & ~STA_RONLY; - - switch (time_state) { - case TIME_OK: - ntp_start_leap_timer(ts); - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - ntp_start_leap_timer(ts); - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } -} -/* - * Called with the xtime lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) -{ - if (txc->modes & ADJ_STATUS) - process_adj_status(txc, ts); - - if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; - - if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; - - if (txc->modes & ADJ_FREQUENCY) { - time_freq = txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); - /* update pps_freq */ - pps_set_freq(time_freq); - } - - if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; - - if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; - - if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; - if (!(time_status & STA_NANO)) - time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); - } - - if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; - - if (txc->modes & ADJ_OFFSET) - ntp_update_offset(txc->offset); - - if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; - - if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); -} - -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. 
- */ -int do_adjtimex(struct timex *txc) -{ - struct timespec ts; - int result; - - /* Validate the data before disabling interrupts */ - if (txc->modes & ADJ_ADJTIME) { - /* singleshot must not be used with any other mode bits */ - if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) - return -EINVAL; - if (!(txc->modes & ADJ_OFFSET_READONLY) && - !capable(CAP_SYS_TIME)) - return -EPERM; - } else { - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* - * if the quartz is off by more than 10% then - * something is VERY wrong! - */ - if (txc->modes & ADJ_TICK && - (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ)) - return -EINVAL; - - if (txc->modes & ADJ_STATUS && time_state != TIME_OK) - hrtimer_cancel(&leap_timer); - } - - if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!capable(CAP_SYS_TIME)) - return -EPERM; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - result = timekeeping_inject_offset(&delta); - if (result) - return result; - } - - getnstimeofday(&ts); - - write_seqlock_irq(&xtime_lock); - - if (txc->modes & ADJ_ADJTIME) { - long save_adjust = time_adjust; - - if (!(txc->modes & ADJ_OFFSET_READONLY)) { - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - ntp_update_frequency(); - } - txc->offset = save_adjust; - } else { - - /* If there are input parameters, then process them: */ - if (txc->modes) - process_adjtimex_modes(txc, &ts); - - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } - - result = time_state; /* mostly `TIME_OK' */ - /* check for errors */ - if (is_error_status(time_status)) - result = TIME_ERROR; - - txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * - PPM_SCALE_INV, NTP_SCALE_SHIFT); - txc->maxerror = time_maxerror; - 
txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = 1; - txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; - txc->tick = tick_usec; - txc->tai = time_tai; - - /* fill PPS status fields */ - pps_fill_timex(txc); - - write_sequnlock_irq(&xtime_lock); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; - if (!(time_status & STA_NANO)) - txc->time.tv_usec /= NSEC_PER_USEC; - - notify_cmos_timer(); - - return result; -} - -#ifdef CONFIG_NTP_PPS - -/* actually struct pps_normtime is good old struct timespec, but it is - * semantically different (and it is the reason why it was invented): - * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] - * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ -struct pps_normtime { - __kernel_time_t sec; /* seconds */ - long nsec; /* nanoseconds */ -}; - -/* normalize the timestamp so that nsec is in the - ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ -static inline struct pps_normtime pps_normalize_ts(struct timespec ts) -{ - struct pps_normtime norm = { - .sec = ts.tv_sec, - .nsec = ts.tv_nsec - }; - - if (norm.nsec > (NSEC_PER_SEC >> 1)) { - norm.nsec -= NSEC_PER_SEC; - norm.sec++; - } - - return norm; -} - -/* get current phase correction and jitter */ -static inline long pps_phase_filter_get(long *jitter) -{ - *jitter = pps_tf[0] - pps_tf[1]; - if (*jitter < 0) - *jitter = -*jitter; - - /* TODO: test various filters */ - return pps_tf[0]; -} - -/* add the sample to the phase filter */ -static inline void pps_phase_filter_add(long err) -{ - pps_tf[2] = pps_tf[1]; - pps_tf[1] = pps_tf[0]; - pps_tf[0] = err; -} - -/* decrease frequency calibration interval length. - * It is halved after four consecutive unstable intervals. 
- */ -static inline void pps_dec_freq_interval(void) -{ - if (--pps_intcnt <= -PPS_INTCOUNT) { - pps_intcnt = -PPS_INTCOUNT; - if (pps_shift > PPS_INTMIN) { - pps_shift--; - pps_intcnt = 0; - } - } -} - -/* increase frequency calibration interval length. - * It is doubled after four consecutive stable intervals. - */ -static inline void pps_inc_freq_interval(void) -{ - if (++pps_intcnt >= PPS_INTCOUNT) { - pps_intcnt = PPS_INTCOUNT; - if (pps_shift < PPS_INTMAX) { - pps_shift++; - pps_intcnt = 0; - } - } -} - -/* update clock frequency based on MONOTONIC_RAW clock PPS signal - * timestamps - * - * At the end of the calibration interval the difference between the - * first and last MONOTONIC_RAW clock timestamps divided by the length - * of the interval becomes the frequency update. If the interval was - * too long, the data are discarded. - * Returns the difference between old and new frequency values. - */ -static long hardpps_update_freq(struct pps_normtime freq_norm) -{ - long delta, delta_mod; - s64 ftemp; - - /* check if the frequency interval was too long */ - if (freq_norm.sec > (2 << pps_shift)) { - time_status |= STA_PPSERROR; - pps_errcnt++; - pps_dec_freq_interval(); - pr_err("hardpps: PPSERROR: interval too long - %ld s\n", - freq_norm.sec); - return 0; - } - - /* here the raw frequency offset and wander (stability) is - * calculated. If the wander is less than the wander threshold - * the interval is increased; otherwise it is decreased. 
- */ - ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, - freq_norm.sec); - delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); - pps_freq = ftemp; - if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); - time_status |= STA_PPSWANDER; - pps_stbcnt++; - pps_dec_freq_interval(); - } else { /* good sample */ - pps_inc_freq_interval(); - } - - /* the stability metric is calculated as the average of recent - * frequency changes, but is used only for performance - * monitoring - */ - delta_mod = delta; - if (delta_mod < 0) - delta_mod = -delta_mod; - pps_stabil += (div_s64(((s64)delta_mod) << - (NTP_SCALE_SHIFT - SHIFT_USEC), - NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; - - /* if enabled, the system clock frequency is updated */ - if ((time_status & STA_PPSFREQ) != 0 && - (time_status & STA_FREQHOLD) == 0) { - time_freq = pps_freq; - ntp_update_frequency(); - } - - return delta; -} - -/* correct REALTIME clock phase error against PPS signal */ -static void hardpps_update_phase(long error) -{ - long correction = -error; - long jitter; - - /* add the sample to the median filter */ - pps_phase_filter_add(correction); - correction = pps_phase_filter_get(&jitter); - - /* Nominal jitter is due to PPS signal noise. If it exceeds the - * threshold, the sample is discarded; otherwise, if so enabled, - * the time offset is updated. 
- */ - if (jitter > (pps_jitter << PPS_POPCORN)) { - pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); - time_status |= STA_PPSJITTER; - pps_jitcnt++; - } else if (time_status & STA_PPSTIME) { - /* correct the time using the phase offset */ - time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, - NTP_INTERVAL_FREQ); - /* cancel running adjtime() */ - time_adjust = 0; - } - /* update jitter */ - pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; -} - -/* - * hardpps() - discipline CPU clock oscillator to external PPS signal - * - * This routine is called at each PPS signal arrival in order to - * discipline the CPU clock oscillator to the PPS signal. It takes two - * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former - * is used to correct clock phase error and the latter is used to - * correct the frequency. - * - * This code is based on David Mills's reference nanokernel - * implementation. It was mostly rewritten but keeps the same idea. 
- */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) -{ - struct pps_normtime pts_norm, freq_norm; - unsigned long flags; - - pts_norm = pps_normalize_ts(*phase_ts); - - write_seqlock_irqsave(&xtime_lock, flags); - - /* clear the error bits, they will be set again if needed */ - time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); - - /* indicate signal presence */ - time_status |= STA_PPSSIGNAL; - pps_valid = PPS_VALID; - - /* when called for the first time, - * just start the frequency interval */ - if (unlikely(pps_fbase.tv_sec == 0)) { - pps_fbase = *raw_ts; - write_sequnlock_irqrestore(&xtime_lock, flags); - return; - } - - /* ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); - - /* check that the signal is in the range - * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ - if ((freq_norm.sec == 0) || - (freq_norm.nsec > MAXFREQ * freq_norm.sec) || - (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { - time_status |= STA_PPSJITTER; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; - write_sequnlock_irqrestore(&xtime_lock, flags); - pr_err("hardpps: PPSJITTER: bad pulse\n"); - return; - } - - /* signal is ok */ - - /* check if the current frequency interval is finished */ - if (freq_norm.sec >= (1 << pps_shift)) { - pps_calcnt++; - /* restart the frequency calibration interval */ - pps_fbase = *raw_ts; - hardpps_update_freq(freq_norm); - } - - hardpps_update_phase(pts_norm.nsec); - - write_sequnlock_irqrestore(&xtime_lock, flags); -} -EXPORT_SYMBOL(hardpps); - -#endif /* CONFIG_NTP_PPS */ - -static int __init ntp_tick_adj_setup(char *str) -{ - ntp_tick_adj = simple_strtol(str, NULL, 0); - ntp_tick_adj <<= NTP_SCALE_SHIFT; - - return 1; -} - -__setup("ntp_tick_adj=", ntp_tick_adj_setup); - -void __init ntp_init(void) -{ - ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = 
ntp_leap_second; -} -/* - * posix-clock.c - support for dynamic clock devices - * - * Copyright (C) 2010 OMICRON electronics GmbH - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include -#include -#include -#include -#include -#include -#include - -static void delete_clock(struct kref *kref); - -/* - * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. 
- */ -static struct posix_clock *get_posix_clock(struct file *fp) -{ - struct posix_clock *clk = fp->private_data; - - down_read(&clk->rwsem); - - if (!clk->zombie) - return clk; - - up_read(&clk->rwsem); - - return NULL; -} - -static void put_posix_clock(struct posix_clock *clk) -{ - up_read(&clk->rwsem); -} - -static ssize_t posix_clock_read(struct file *fp, char __user *buf, - size_t count, loff_t *ppos) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -EINVAL; - - if (!clk) - return -ENODEV; - - if (clk->ops.read) - err = clk->ops.read(clk, fp->f_flags, buf, count); - - put_posix_clock(clk); - - return err; -} - -static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) -{ - struct posix_clock *clk = get_posix_clock(fp); - int result = 0; - - if (!clk) - return -ENODEV; - - if (clk->ops.poll) - result = clk->ops.poll(clk, fp, wait); - - put_posix_clock(clk); - - return result; -} - -static int posix_clock_fasync(int fd, struct file *fp, int on) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = 0; - - if (!clk) - return -ENODEV; - - if (clk->ops.fasync) - err = clk->ops.fasync(clk, fd, fp, on); - - put_posix_clock(clk); - - return err; -} - -static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENODEV; - - if (!clk) - return -ENODEV; - - if (clk->ops.mmap) - err = clk->ops.mmap(clk, vma); - - put_posix_clock(clk); - - return err; -} - -static long posix_clock_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(clk, cmd, arg); - - put_posix_clock(clk); - - return err; -} - -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - 
return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(clk, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - -static int posix_clock_open(struct inode *inode, struct file *fp) -{ - int err; - struct posix_clock *clk = - container_of(inode->i_cdev, struct posix_clock, cdev); - - down_read(&clk->rwsem); - - if (clk->zombie) { - err = -ENODEV; - goto out; - } - if (clk->ops.open) - err = clk->ops.open(clk, fp->f_mode); - else - err = 0; - - if (!err) { - kref_get(&clk->kref); - fp->private_data = clk; - } -out: - up_read(&clk->rwsem); - return err; -} - -static int posix_clock_release(struct inode *inode, struct file *fp) -{ - struct posix_clock *clk = fp->private_data; - int err = 0; - - if (clk->ops.release) - err = clk->ops.release(clk); - - kref_put(&clk->kref, delete_clock); - - fp->private_data = NULL; - - return err; -} - -static const struct file_operations posix_clock_file_operations = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .read = posix_clock_read, - .poll = posix_clock_poll, - .unlocked_ioctl = posix_clock_ioctl, - .open = posix_clock_open, - .release = posix_clock_release, - .fasync = posix_clock_fasync, - .mmap = posix_clock_mmap, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif -}; - -int posix_clock_register(struct posix_clock *clk, dev_t devid) -{ - int err; - - kref_init(&clk->kref); - init_rwsem(&clk->rwsem); - - cdev_init(&clk->cdev, &posix_clock_file_operations); - clk->cdev.owner = clk->ops.owner; - err = cdev_add(&clk->cdev, devid, 1); - - return err; -} -EXPORT_SYMBOL_GPL(posix_clock_register); - -static void delete_clock(struct kref *kref) -{ - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - - if (clk->release) - clk->release(clk); -} - -void posix_clock_unregister(struct posix_clock *clk) -{ - cdev_del(&clk->cdev); - - down_write(&clk->rwsem); - clk->zombie = true; - up_write(&clk->rwsem); - - kref_put(&clk->kref, delete_clock); -} 
-EXPORT_SYMBOL_GPL(posix_clock_unregister); - -struct posix_clock_desc { - struct file *fp; - struct posix_clock *clk; -}; - -static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) -{ - struct file *fp = fget(CLOCKID_TO_FD(id)); - int err = -EINVAL; - - if (!fp) - return err; - - if (fp->f_op->open != posix_clock_open || !fp->private_data) - goto out; - - cd->fp = fp; - cd->clk = get_posix_clock(fp); - - err = cd->clk ? 0 : -ENODEV; -out: - if (err) - fput(fp); - return err; -} - -static void put_clock_desc(struct posix_clock_desc *cd) -{ - put_posix_clock(cd->clk); - fput(cd->fp); -} - -static int pc_clock_adjtime(clockid_t id, struct timex *tx) -{ - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { - err = -EACCES; - goto out; - } - - if (cd.clk->ops.clock_adjtime) - err = cd.clk->ops.clock_adjtime(cd.clk, tx); - else - err = -EOPNOTSUPP; -out: - put_clock_desc(&cd); - - return err; -} - -static int pc_clock_gettime(clockid_t id, struct timespec *ts) -{ - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.clock_gettime) - err = cd.clk->ops.clock_gettime(cd.clk, ts); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static int pc_clock_getres(clockid_t id, struct timespec *ts) -{ - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.clock_getres) - err = cd.clk->ops.clock_getres(cd.clk, ts); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static int pc_clock_settime(clockid_t id, const struct timespec *ts) -{ - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { - err = -EACCES; - goto out; - } - - if (cd.clk->ops.clock_settime) - err = cd.clk->ops.clock_settime(cd.clk, ts); - 
else - err = -EOPNOTSUPP; -out: - put_clock_desc(&cd); - - return err; -} - -static int pc_timer_create(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_create) - err = cd.clk->ops.timer_create(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static int pc_timer_delete(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_delete) - err = cd.clk->ops.timer_delete(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - - if (get_clock_desc(id, &cd)) - return; - - if (cd.clk->ops.timer_gettime) - cd.clk->ops.timer_gettime(cd.clk, kit, ts); - - put_clock_desc(&cd); -} - -static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec *ts, struct itimerspec *old) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_settime) - err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -struct k_clock clock_posix_dynamic = { - .clock_getres = pc_clock_getres, - .clock_set = pc_clock_settime, - .clock_get = pc_clock_gettime, - .clock_adj = pc_clock_adjtime, - .timer_create = pc_timer_create, - .timer_set = pc_timer_settime, - .timer_del = pc_timer_delete, - .timer_get = pc_timer_gettime, -}; -/* - * linux/kernel/time/tick-broadcast.c - * - * This file contains functions which emulate a local clock-event - * device via a broadcast event source. 
- * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "tick-internal.h" - -/* - * Broadcast support for broken x86 hardware, where the local apic - * timer stops in C3 state. - */ - -static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); -static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); -static int tick_broadcast_force; - -#ifdef CONFIG_TICK_ONESHOT -static void tick_broadcast_clear_oneshot(int cpu); -#else -static inline void tick_broadcast_clear_oneshot(int cpu) { } -#endif - -/* - * Debugging: see timer_list.c - */ -struct tick_device *tick_get_broadcast_device(void) -{ - return &tick_broadcast_device; -} - -struct cpumask *tick_get_broadcast_mask(void) -{ - return to_cpumask(tick_broadcast_mask); -} - -/* - * Start the device in periodic mode - */ -static void tick_broadcast_start_periodic(struct clock_event_device *bc) -{ - if (bc) - tick_setup_periodic(bc, 1); -} - -/* - * Check, if the device can be utilized as broadcast device: - */ -int tick_check_broadcast_device(struct clock_event_device *dev) -{ - if ((tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; - - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); - tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) - tick_broadcast_start_periodic(dev); - return 1; -} - -/* - * Check, if the device is the broadcast device - */ -int tick_is_broadcast_device(struct clock_event_device *dev) -{ - return (dev && tick_broadcast_device.evtdev == dev); -} - -/* - * Check, if 
the device is disfunctional and a place holder, which - * needs to be handled by the broadcast device. - */ -int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - /* - * Devices might be registered with both periodic and oneshot - * mode disabled. This signals, that the device needs to be - * operated from the broadcast device and is a placeholder for - * the cpu local device. - */ - if (!tick_device_is_functional(dev)) { - dev->event_handler = tick_handle_periodic; - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); - tick_broadcast_start_periodic(tick_broadcast_device.evtdev); - ret = 1; - } else { - /* - * When the new device is not affected by the stop - * feature and the cpu is marked in the broadcast mask - * then clear the broadcast bit. - */ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { - int cpu = smp_processor_id(); - - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); - tick_broadcast_clear_oneshot(cpu); - } - } - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); - return ret; -} - -/* - * Broadcast the event to the cpus, which are set in the mask (mangled). - */ -static void tick_do_broadcast(struct cpumask *mask) -{ - int cpu = smp_processor_id(); - struct tick_device *td; - - /* - * Check, if the current cpu is in the mask - */ - if (cpumask_test_cpu(cpu, mask)) { - cpumask_clear_cpu(cpu, mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->event_handler(td->evtdev); - } - - if (!cpumask_empty(mask)) { - /* - * It might be necessary to actually check whether the devices - * have different broadcast functions. For now, just use the - * one of the first device. 
This works as long as we have this - * misfeature only on x86 (lapic) - */ - td = &per_cpu(tick_cpu_device, cpumask_first(mask)); - td->evtdev->broadcast(mask); - } -} - -/* - * Periodic broadcast: - * - invoke the broadcast handlers - */ -static void tick_do_periodic_broadcast(void) -{ - raw_spin_lock(&tick_broadcast_lock); - - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); - - raw_spin_unlock(&tick_broadcast_lock); -} - -/* - * Event handler for periodic broadcast ticks - */ -static void tick_handle_periodic_broadcast(struct clock_event_device *dev) -{ - ktime_t next; - - tick_do_periodic_broadcast(); - - /* - * The device is in periodic mode. No reprogramming necessary: - */ - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; - - /* - * Setup the next period for devices, which do not have - * periodic mode. We read dev->next_event first and add to it - * when the event already expired. clockevents_program_event() - * sets dev->next_event only when the event is really - * programmed to the device. - */ - for (next = dev->next_event; ;) { - next = ktime_add(next, tick_period); - - if (!clockevents_program_event(dev, next, false)) - return; - tick_do_periodic_broadcast(); - } -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop - */ -static void tick_do_broadcast_on_off(unsigned long *reason) -{ - struct clock_event_device *bc, *dev; - struct tick_device *td; - unsigned long flags; - int cpu, bc_stopped; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - bc = tick_broadcast_device.evtdev; - - /* - * Is the device not affected by the powerstate ? 
- */ - if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; - - if (!tick_device_is_functional(dev)) - goto out; - - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); - - switch (*reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) - clockevents_shutdown(dev); - } - if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - tick_broadcast_force = 1; - break; - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) - tick_setup_periodic(dev, 0); - } - break; - } - - if (cpumask_empty(tick_get_broadcast_mask())) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); - } -out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop. 
- */ -void tick_broadcast_on_off(unsigned long reason, int *oncpu) -{ - if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) - printk(KERN_ERR "tick-broadcast: ignoring broadcast for " - "offline CPU #%d\n", *oncpu); - else - tick_do_broadcast_on_off(&reason); -} - -/* - * Set the periodic handler depending on broadcast on/off - */ -void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) -{ - if (!broadcast) - dev->event_handler = tick_handle_periodic; - else - dev->event_handler = tick_handle_periodic_broadcast; -} - -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int *cpup) -{ - struct clock_event_device *bc; - unsigned long flags; - unsigned int cpu = *cpup; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); - - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) - clockevents_shutdown(bc); - } - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -void tick_suspend_broadcast(void) -{ - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - if (bc) - clockevents_shutdown(bc); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -int tick_resume_broadcast(void) -{ - struct clock_event_device *bc; - unsigned long flags; - int broadcast = 0; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - - if (bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); - - switch (tick_broadcast_device.mode) { - case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) - tick_broadcast_start_periodic(bc); - broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); - break; - case TICKDEV_MODE_ONESHOT: - broadcast = tick_resume_broadcast_oneshot(bc); - break; - } - } - 
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); - - return broadcast; -} - - -#ifdef CONFIG_TICK_ONESHOT - -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); - -/* - * Exposed for debugging: see timer_list.c - */ -struct cpumask *tick_get_broadcast_oneshot_mask(void) -{ - return to_cpumask(tick_broadcast_oneshot_mask); -} - -static int tick_broadcast_set_event(ktime_t expires, int force) -{ - struct clock_event_device *bc = tick_broadcast_device.evtdev; - - return clockevents_program_event(bc, expires, force); -} - -int tick_resume_broadcast_oneshot(struct clock_event_device *bc) -{ - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return 0; -} - -/* - * Called from irq_enter() when idle was interrupted to reenable the - * per cpu device. - */ -void tick_check_oneshot_broadcast(int cpu) -{ - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); - - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); - } -} - -/* - * Handle oneshot mode broadcasting - */ -static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) -{ - struct tick_device *td; - ktime_t now, next_event; - int cpu; - - raw_spin_lock(&tick_broadcast_lock); -again: - dev->next_event.tv64 = KTIME_MAX; - next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); - now = ktime_get(); - /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { - td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); - else if (td->evtdev->next_event.tv64 < next_event.tv64) - next_event.tv64 = td->evtdev->next_event.tv64; - } - - /* - * Wakeup the cpus which have an expired event. - */ - tick_do_broadcast(to_cpumask(tmpmask)); - - /* - * Two reasons for reprogram: - * - * - The global event did not expire any CPU local - * events. 
This happens in dyntick mode, as the maximum PIT - * delta is quite small. - * - * - There are pending events on sleeping CPUs which were not - * in the event mask - */ - if (next_event.tv64 != KTIME_MAX) { - /* - * Rearm the broadcast device. If event expired, - * repeat the above - */ - if (tick_broadcast_set_event(next_event, 0)) - goto again; - } - raw_spin_unlock(&tick_broadcast_lock); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop - */ -void tick_broadcast_oneshot_control(unsigned long reason) -{ - struct clock_event_device *bc, *dev; - struct tick_device *td; - unsigned long flags; - int cpu; - - /* - * Periodic mode does not care about the enter/exit of power - * states - */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return; - - /* - * We are called with preemtion disabled from the depth of the - * idle code, so we can't be moved away. - */ - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; - - bc = tick_broadcast_device.evtdev; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); - } - } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); - } - } - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Reset the one shot broadcast for a cpu - * - * Called with tick_broadcast_lock held - */ -static void 
tick_broadcast_clear_oneshot(int cpu) -{ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); -} - -static void tick_broadcast_init_next_event(struct cpumask *mask, - ktime_t expires) -{ - struct tick_device *td; - int cpu; - - for_each_cpu(cpu, mask) { - td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev) - td->evtdev->next_event = expires; - } -} - -/** - * tick_broadcast_setup_oneshot - setup the broadcast device - */ -void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - int cpu = smp_processor_id(); - - /* Set it up only once ! */ - if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; - - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - /* Take the do_timer update */ - tick_do_timer_cpu = cpu; - - /* - * We must be careful here. There might be other CPUs - * waiting for periodic broadcast. We need to set the - * oneshot_mask bits for those and program the - * broadcast device to fire. - */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); - - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { - tick_broadcast_init_next_event(to_cpumask(tmpmask), - tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); - } else - bc->next_event.tv64 = KTIME_MAX; - } else { - /* - * The first cpu which switches to oneshot mode sets - * the bit for all other cpus which are in the general - * (periodic) broadcast mask. So the bit is set and - * would prevent the first broadcast enter after this - * to program the bc device. 
- */ - tick_broadcast_clear_oneshot(cpu); - } -} - -/* - * Select oneshot operating mode for the broadcast device - */ -void tick_broadcast_switch_to_oneshot(void) -{ - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; - bc = tick_broadcast_device.evtdev; - if (bc) - tick_broadcast_setup_oneshot(bc); - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - - -/* - * Remove a dead CPU from broadcasting - */ -void tick_shutdown_broadcast_oneshot(unsigned int *cpup) -{ - unsigned long flags; - unsigned int cpu = *cpup; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! - */ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Check, whether the broadcast device is in one shot mode - */ -int tick_broadcast_oneshot_active(void) -{ - return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; -} - -/* - * Check whether the broadcast device supports oneshot. - */ -bool tick_broadcast_oneshot_available(void) -{ - struct clock_event_device *bc = tick_broadcast_device.evtdev; - - return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; -} - -#endif -/* - * linux/kernel/time/tick-common.c - * - * This file contains the base functions to manage periodic tick - * related events. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. 
- */ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "tick-internal.h" - -/* - * Tick devices - */ -DEFINE_PER_CPU(struct tick_device, tick_cpu_device); -/* - * Tick next event: keeps track of the tick time - */ -ktime_t tick_next_period; -ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); - -/* - * Debugging: see timer_list.c - */ -struct tick_device *tick_get_device(int cpu) -{ - return &per_cpu(tick_cpu_device, cpu); -} - -/** - * tick_is_oneshot_available - check for a oneshot capable event device - */ -int tick_is_oneshot_available(void) -{ - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - - if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) - return 0; - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 1; - return tick_broadcast_oneshot_available(); -} - -/* - * Periodic tick - */ -static void tick_periodic(int cpu) -{ - if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); - - /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, tick_period); - - do_timer(1); - write_sequnlock(&xtime_lock); - } - - update_process_times(user_mode(get_irq_regs())); - profile_tick(CPU_PROFILING); -} - -/* - * Event handler for periodic ticks - */ -void tick_handle_periodic(struct clock_event_device *dev) -{ - int cpu = smp_processor_id(); - ktime_t next; - - tick_periodic(cpu); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); - for (;;) { - if (!clockevents_program_event(dev, next, false)) - return; - /* - * Have to be careful here. If we're in oneshot mode, - * before we call tick_periodic() in a loop, we need - * to be sure we're using a real hardware clocksource. 
- * Otherwise we could get trapped in an infinite - * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing - * the loop to trigger again and again. - */ - if (timekeeping_valid_for_hres()) - tick_periodic(cpu); - next = ktime_add(next, tick_period); - } -} - -/* - * Setup the device for a periodic tick - */ -void tick_setup_periodic(struct clock_event_device *dev, int broadcast) -{ - tick_set_periodic_handler(dev, broadcast); - - /* Broadcast setup ? */ - if (!tick_device_is_functional(dev)) - return; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !tick_broadcast_oneshot_active()) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); - } else { - unsigned long seq; - ktime_t next; - - do { - seq = read_seqbegin(&xtime_lock); - next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); - - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - - for (;;) { - if (!clockevents_program_event(dev, next, false)) - return; - next = ktime_add(next, tick_period); - } - } -} - -/* - * Setup the tick device - */ -static void tick_setup_device(struct tick_device *td, - struct clock_event_device *newdev, int cpu, - const struct cpumask *cpumask) -{ - ktime_t next_event; - void (*handler)(struct clock_event_device *) = NULL; - - /* - * First device setup ? - */ - if (!td->evtdev) { - /* - * If no cpu took the do_timer update, assign it to - * this cpu: - */ - if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; - tick_next_period = ktime_get(); - tick_period = ktime_set(0, NSEC_PER_SEC / HZ); - } - - /* - * Startup in periodic mode first. 
- */ - td->mode = TICKDEV_MODE_PERIODIC; - } else { - handler = td->evtdev->event_handler; - next_event = td->evtdev->next_event; - td->evtdev->event_handler = clockevents_handle_noop; - } - - td->evtdev = newdev; - - /* - * When the device is not per cpu, pin the interrupt to the - * current cpu: - */ - if (!cpumask_equal(newdev->cpumask, cpumask)) - irq_set_affinity(newdev->irq, cpumask); - - /* - * When global broadcasting is active, check if the current - * device is registered as a placeholder for broadcast mode. - * This allows us to handle this x86 misfeature in a generic - * way. - */ - if (tick_device_uses_broadcast(newdev, cpu)) - return; - - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(newdev, 0); - else - tick_setup_oneshot(newdev, handler, next_event); -} - -/* - * Check, if the new registered device should be used. - */ -static int tick_check_new_device(struct clock_event_device *newdev) -{ - struct clock_event_device *curdev; - struct tick_device *td; - int cpu, ret = NOTIFY_OK; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); - - cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, newdev->cpumask)) - goto out_bc; - - td = &per_cpu(tick_cpu_device, cpu); - curdev = td->evtdev; - - /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } - - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! 
- */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } - - /* - * Replace the eventually existing device by the new - * device. If the current device is the broadcast device, do - * not give it back to the clockevents layer ! - */ - if (tick_is_broadcast_device(curdev)) { - clockevents_shutdown(curdev); - curdev = NULL; - } - clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); - if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) - tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; - -out_bc: - /* - * Can the new device be used as a broadcast device ? - */ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; -} - -/* - * Transfer the do_timer job away from a dying cpu. - * - * Called with interrupts disabled. - */ -static void tick_handover_do_timer(int *cpup) -{ - if (*cpup == tick_do_timer_cpu) { - int cpu = cpumask_first(cpu_online_mask); - - tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : - TICK_DO_TIMER_NONE; - } -} - -/* - * Shutdown an event device on a given cpu: - * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. - */ -static void tick_shutdown(unsigned int *cpup) -{ - struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); - struct clock_event_device *dev = td->evtdev; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); - td->mode = TICKDEV_MODE_PERIODIC; - if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! 
- */ - dev->mode = CLOCK_EVT_MODE_UNUSED; - clockevents_exchange_device(dev, NULL); - td->evtdev = NULL; - } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); -} - -static void tick_suspend(void) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); - clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); -} - -static void tick_resume(void) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - int broadcast = tick_resume_broadcast(); - - raw_spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); - - if (!broadcast) { - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); - } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); -} - -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - -/** - * tick_init - initialize the tick control - * - * 
Register the notifier with the clockevents framework - */ -void __init tick_init(void) -{ - clockevents_register_notifier(&tick_notifier); -} -/* - * linux/kernel/time/tick-oneshot.c - * - * This file contains functions which manage high resolution tick - * related events. - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner - * - * This code is licenced under the GPL version 2. For details see - * kernel-base/COPYING. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "tick-internal.h" - -/** - * tick_program_event - */ -int tick_program_event(ktime_t expires, int force) -{ - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - - return clockevents_program_event(dev, expires, force); -} - -/** - * tick_resume_onshot - resume oneshot mode - */ -void tick_resume_oneshot(void) -{ - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(dev, ktime_get(), true); -} - -/** - * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) - */ -void tick_setup_oneshot(struct clock_event_device *newdev, - void (*handler)(struct clock_event_device *), - ktime_t next_event) -{ - newdev->event_handler = handler; - clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(newdev, next_event, true); -} - -/** - * tick_switch_to_oneshot - switch to oneshot mode - */ -int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) -{ - struct tick_device *td = &__get_cpu_var(tick_cpu_device); - struct clock_event_device *dev = td->evtdev; - - if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) { - - printk(KERN_INFO "Clockevents: " - "could not switch to one-shot mode:"); - if (!dev) { - printk(" no tick device\n"); - } else { - 
if (!tick_device_is_functional(dev)) - printk(" %s is not functional.\n", dev->name); - else - printk(" %s does not support one-shot mode.\n", - dev->name); - } - return -EINVAL; - } - - td->mode = TICKDEV_MODE_ONESHOT; - dev->event_handler = handler; - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_switch_to_oneshot(); - return 0; -} - -/** - * tick_check_oneshot_mode - check whether the system is in oneshot mode - * - * returns 1 when either nohz or highres are enabled. otherwise 0. - */ -int tick_oneshot_mode_active(void) -{ - unsigned long flags; - int ret; - - local_irq_save(flags); - ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; - local_irq_restore(flags); - - return ret; -} - -#ifdef CONFIG_HIGH_RES_TIMERS -/** - * tick_init_highres - switch to high resolution mode - * - * Called with interrupts disabled. - */ -int tick_init_highres(void) -{ - return tick_switch_to_oneshot(hrtimer_interrupt); -} -#endif -/* - * linux/kernel/time/tick-sched.c - * - * Copyright(C) 2005-2006, Thomas Gleixner - * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner - * - * No idle tick implementation for low and high resolution timers - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * Distribute under GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "tick-internal.h" - -/* - * Per cpu nohz control structure - */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); - -/* - * The time, when the last jiffy update happened. Protected by xtime_lock. - */ -static ktime_t last_jiffies_update; - -struct tick_sched *tick_get_tick_sched(int cpu) -{ - return &per_cpu(tick_cpu_sched, cpu); -} - -/* - * Must be called with interrupts disabled ! 
- */ -static void tick_do_update_jiffies64(ktime_t now) -{ - unsigned long ticks = 0; - ktime_t delta; - - /* - * Do a quick check without holding xtime_lock: - */ - delta = ktime_sub(now, last_jiffies_update); - if (delta.tv64 < tick_period.tv64) - return; - - /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); - - delta = ktime_sub(now, last_jiffies_update); - if (delta.tv64 >= tick_period.tv64) { - - delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); - - /* Slow path for long timeouts */ - if (unlikely(delta.tv64 >= tick_period.tv64)) { - s64 incr = ktime_to_ns(tick_period); - - ticks = ktime_divns(delta, incr); - - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); - } - do_timer(++ticks); - - /* Keep the tick_next_period variable up to date */ - tick_next_period = ktime_add(last_jiffies_update, tick_period); - } - write_sequnlock(&xtime_lock); -} - -/* - * Initialize and return retrieve the jiffies update. - */ -static ktime_t tick_init_jiffy_update(void) -{ - ktime_t period; - - write_seqlock(&xtime_lock); - /* Did we start the jiffies update yet ? */ - if (last_jiffies_update.tv64 == 0) - last_jiffies_update = tick_next_period; - period = last_jiffies_update; - write_sequnlock(&xtime_lock); - return period; -} - -/* - * NOHZ - aka dynamic tick functionality - */ -#ifdef CONFIG_NO_HZ -/* - * NO HZ enabled ? 
- */ -static int tick_nohz_enabled __read_mostly = 1; - -/* - * Enable / Disable tickless mode - */ -static int __init setup_tick_nohz(char *str) -{ - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; -} - -__setup("nohz=", setup_tick_nohz); - -/** - * tick_nohz_update_jiffies - update jiffies when idle was interrupted - * - * Called from interrupt entry when the CPU was idle - * - * In case the sched_tick was stopped on this CPU, we have to check if jiffies - * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. - */ -static void tick_nohz_update_jiffies(ktime_t now) -{ - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - unsigned long flags; - - ts->idle_waketime = now; - - local_irq_save(flags); - tick_do_update_jiffies64(now); - local_irq_restore(flags); - - touch_softlockup_watchdog(); -} - -/* - * Updates the per cpu time idle statistics counters - */ -static void -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) -{ - ktime_t delta; - - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - if (nr_iowait_cpu(cpu) > 0) - ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); - else - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_entrytime = now; - } - - if (last_update_time) - *last_update_time = ktime_to_us(now); - -} - -static void tick_nohz_stop_idle(int cpu, ktime_t now) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - update_ts_time_stats(cpu, ts, now, NULL); - ts->idle_active = 0; - - sched_clock_idle_wakeup_event(0); -} - -static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) -{ - ktime_t now; - - now = ktime_get(); - - update_ts_time_stats(cpu, ts, now, 
NULL); - - ts->idle_entrytime = now; - ts->idle_active = 1; - sched_clock_idle_sleep_event(); - return now; -} - -/** - * get_cpu_idle_time_us - get the total idle time of a cpu - * @cpu: CPU number to query - * @last_update_time: variable to store update time in. Do not update - * counters if NULL. - * - * Return the cummulative idle time (since boot) for a given - * CPU, in microseconds. - * - * This time is measured via accounting rather than sampling, - * and is as accurate as ktime_get() is. - * - * This function returns -1 if NOHZ is not enabled. - */ -u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, idle; - - if (!tick_nohz_enabled) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - idle = ts->idle_sleeptime; - } else { - if (ts->idle_active && !nr_iowait_cpu(cpu)) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - idle = ktime_add(ts->idle_sleeptime, delta); - } else { - idle = ts->idle_sleeptime; - } - } - - return ktime_to_us(idle); - -} -EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); - -/** - * get_cpu_iowait_time_us - get the total iowait time of a cpu - * @cpu: CPU number to query - * @last_update_time: variable to store update time in. Do not update - * counters if NULL. - * - * Return the cummulative iowait time (since boot) for a given - * CPU, in microseconds. - * - * This time is measured via accounting rather than sampling, - * and is as accurate as ktime_get() is. - * - * This function returns -1 if NOHZ is not enabled. 
- */ -u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, iowait; - - if (!tick_nohz_enabled) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - iowait = ts->iowait_sleeptime; - } else { - if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - iowait = ktime_add(ts->iowait_sleeptime, delta); - } else { - iowait = ts->iowait_sleeptime; - } - } - - return ktime_to_us(iowait); -} -EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); - -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) -{ - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, now; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - u64 time_delta; - int cpu; - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - - now = tick_nohz_start_idle(cpu, ts); - - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. 
- */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - - if (need_resched()) - return; - - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - - ts->idle_calls++; - /* Read jiffies and the time when jiffies were updated last */ - do { - seq = read_seqbegin(&xtime_lock); - last_update = last_jiffies_update; - last_jiffies = jiffies; - time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&xtime_lock, seq)); - - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { - next_jiffies = last_jiffies + 1; - delta_jiffies = 1; - } else { - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - } - /* - * Do not stop the tick, if we are only one off - * or if the cpu is required for rcu - */ - if (!ts->tick_stopped && delta_jiffies == 1) - goto out; - - /* Schedule the tick, if we are at least one jiffie off */ - if ((long)delta_jiffies >= 1) { - - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. Keep track of the fact that it was the one - * which had the do_timer() duty last. If this cpu is - * the one which had the do_timer() duty last, we - * limit the sleep time to the timekeeping - * max_deferement value which we retrieved - * above. Otherwise we can sleep as long as we want. 
- */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->do_timer_last = 1; - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; - ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; - } - - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals - * that there is no timer pending or at least extremely - * far into the future (12 days for HZ=1000). In this - * case we set the expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. - */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); - else - expires.tv64 = KTIME_MAX; - - /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) - goto out; - - /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. - */ - if (!ts->tick_stopped) { - select_nohz_load_balancer(1); - - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); - ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; - } - - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - - /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. 
- */ - if (unlikely(expires.tv64 == KTIME_MAX)) { - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_cancel(&ts->sched_timer); - goto out; - } - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - goto out; - } else if (!tick_program_event(expires, 0)) - goto out; - /* - * We are past the event already. So we crossed a - * jiffie boundary. Update jiffies and raise the - * softirq. - */ - tick_do_update_jiffies64(ktime_get()); - } - raise_softirq_irqoff(TIMER_SOFTIRQ); -out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; - ts->sleep_length = ktime_sub(dev->next_event, now); -} - -/** - * tick_nohz_idle_enter - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: - * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. - */ -void tick_nohz_idle_enter(void) -{ - struct tick_sched *ts; - - WARN_ON_ONCE(irqs_disabled()); - - /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ - set_cpu_sd_state_idle(); - - local_irq_disable(); - - ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). 
- */ - ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); - - local_irq_enable(); -} - -/** - * tick_nohz_irq_exit - update next tick event from interrupt exit - * - * When an interrupt fires while we are idle and it doesn't cause - * a reschedule, it may still add, modify or delete a timer, enqueue - * an RCU callback, etc... - * So we need to re-calculate and reprogram the next tick event. - */ -void tick_nohz_irq_exit(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - if (!ts->inidle) - return; - - tick_nohz_stop_sched_tick(ts); -} - -/** - * tick_nohz_get_sleep_length - return the length of the current sleep - * - * Called from power state control code with interrupts disabled - */ -ktime_t tick_nohz_get_sleep_length(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - return ts->sleep_length; -} - -static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) -{ - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event( - hrtimer_get_expires(&ts->sched_timer), 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } -} - -/** - * tick_nohz_idle_exit - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. 
- */ -void tick_nohz_idle_exit(void) -{ - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif - ktime_t now; - - local_irq_disable(); - - if (ts->idle_active || (ts->inidle && ts->tick_stopped)) - now = ktime_get(); - - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - - if (!ts->inidle || !ts->tick_stopped) { - ts->inidle = 0; - local_irq_enable(); - return; - } - - ts->inidle = 0; - - /* Update jiffies first */ - select_nohz_load_balancer(0); - tick_do_update_jiffies64(now); - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! - */ - ticks = jiffies - ts->idle_jiffies; - /* - * We might be one off. Do not randomly account a huge number of ticks! - */ - if (ticks && ticks < LONG_MAX) - account_idle_ticks(ticks); -#endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); - - local_irq_enable(); -} - -static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) -{ - hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); -} - -/* - * The nohz low res interrupt handler - */ -static void tick_nohz_handler(struct clock_event_device *dev) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); - ktime_t now = ktime_get(); - - dev->next_event.tv64 = KTIME_MAX; - - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. 
If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); - - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start - * of idle" jiffy stamp so the idle accounting adjustment we - * do when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - - while (tick_nohz_reprogram(ts, now)) { - now = ktime_get(); - tick_do_update_jiffies64(now); - } -} - -/** - * tick_nohz_switch_to_nohz - switch to nohz mode - */ -static void tick_nohz_switch_to_nohz(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - ktime_t next; - - if (!tick_nohz_enabled) - return; - - local_irq_disable(); - if (tick_switch_to_oneshot(tick_nohz_handler)) { - local_irq_enable(); - return; - } - - ts->nohz_mode = NOHZ_MODE_LOWRES; - - /* - * Recycle the hrtimer in ts, so we can share the - * hrtimer_forward with the highres code. - */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - /* Get the next period */ - next = tick_init_jiffy_update(); - - for (;;) { - hrtimer_set_expires(&ts->sched_timer, next); - if (!tick_program_event(next, 0)) - break; - next = ktime_add(next, tick_period); - } - local_irq_enable(); -} - -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. 
That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. - * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. - */ -static void tick_nohz_kick_tick(int cpu, ktime_t now) -{ -#if 0 - /* Switch back to 2.6.27 behaviour */ - - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t delta; - - /* - * Do not touch the tick device, when the next expiry is either - * already reached or less/equal than the tick period. - */ - delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); - if (delta.tv64 <= tick_period.tv64) - return; - - tick_nohz_restart(ts, now); -#endif -} - -static inline void tick_check_nohz(int cpu) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now; - - if (!ts->idle_active && !ts->tick_stopped) - return; - now = ktime_get(); - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - if (ts->tick_stopped) { - tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(cpu, now); - } -} - -#else - -static inline void tick_nohz_switch_to_nohz(void) { } -static inline void tick_check_nohz(int cpu) { } - -#endif /* NO_HZ */ - -/* - * Called from irq_enter to notify about the possible interruption of idle() - */ -void tick_check_idle(int cpu) -{ - tick_check_oneshot_broadcast(cpu); - tick_check_nohz(cpu); -} - -/* - * High resolution timer specific code - */ -#ifdef CONFIG_HIGH_RES_TIMERS -/* - * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. - */ -static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) -{ - struct tick_sched *ts = - container_of(timer, struct tick_sched, sched_timer); - struct pt_regs *regs = get_irq_regs(); - ktime_t now = ktime_get(); - int cpu = smp_processor_id(); - -#ifdef CONFIG_NO_HZ - /* - * Check if the do_timer duty was dropped. 
We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; -#endif - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); - - /* - * Do not call, when we are not in irq context and have - * no valid regs pointer - */ - if (regs) { - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start of - * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - } - - hrtimer_forward(timer, now, tick_period); - - return HRTIMER_RESTART; -} - -/** - * tick_setup_sched_timer - setup the tick emulation timer - */ -void tick_setup_sched_timer(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - ktime_t now = ktime_get(); - - /* - * Emulate tick processing via per-CPU hrtimers: - */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ts->sched_timer.function = tick_sched_timer; - - /* Get the next period (per cpu) */ - hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - - for (;;) { - hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - now = ktime_get(); - } - -#ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) - ts->nohz_mode = NOHZ_MODE_HIGHRES; -#endif -} 
-#endif /* HIGH_RES_TIMERS */ - -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS -void tick_cancel_sched_timer(int cpu) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - -# ifdef CONFIG_HIGH_RES_TIMERS - if (ts->sched_timer.base) - hrtimer_cancel(&ts->sched_timer); -# endif - - ts->nohz_mode = NOHZ_MODE_INACTIVE; -} -#endif - -/** - * Async notification about clocksource changes - */ -void tick_clock_notify(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); -} - -/* - * Async notification about clock event changes - */ -void tick_oneshot_notify(void) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - set_bit(0, &ts->check_clocks); -} - -/** - * Check, if a change happened, which makes oneshot possible. - * - * Called cyclic from the hrtimer softirq (driven by the timer - * softirq) allow_nohz signals, that we can switch into low-res nohz - * mode, because high resolution timers are disabled (either compile - * or runtime). - */ -int tick_check_oneshot_change(int allow_nohz) -{ - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - - if (!test_and_clear_bit(0, &ts->check_clocks)) - return 0; - - if (ts->nohz_mode != NOHZ_MODE_INACTIVE) - return 0; - - if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) - return 0; - - if (!allow_nohz) - return 1; - - tick_nohz_switch_to_nohz(); - return 0; -} -/* - * Copyright (C) 2009 Intel Corporation. - * Author: Patrick Ohly - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -/* - * fixed point arithmetic scale factor for skew - * - * Usually one would measure skew in ppb (parts per billion, 1e9), but - * using a factor of 2 simplifies the math. - */ -#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) - -ktime_t timecompare_transform(struct timecompare *sync, - u64 source_tstamp) -{ - u64 nsec; - - nsec = source_tstamp + sync->offset; - nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / - TIMECOMPARE_SKEW_RESOLUTION; - - return ns_to_ktime(nsec); -} -EXPORT_SYMBOL_GPL(timecompare_transform); - -int timecompare_offset(struct timecompare *sync, - s64 *offset, - u64 *source_tstamp) -{ - u64 start_source = 0, end_source = 0; - struct { - s64 offset; - s64 duration_target; - } buffer[10], sample, *samples; - int counter = 0, i; - int used; - int index; - int num_samples = sync->num_samples; - - if (num_samples > ARRAY_SIZE(buffer)) { - samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); - if (!samples) { - samples = buffer; - num_samples = ARRAY_SIZE(buffer); - } - } else { - samples = buffer; - } - - /* run until we have enough valid samples, but do not try forever */ - i = 0; - counter = 0; - while (1) { - u64 ts; - ktime_t start, end; - - start = sync->target(); - ts = timecounter_read(sync->source); - end = sync->target(); - - if (!i) - start_source = ts; - - /* ignore negative durations */ - sample.duration_target = ktime_to_ns(ktime_sub(end, start)); - if (sample.duration_target >= 0) { - /* - * assume symetric delay to and from source: - * average target time corresponds to measured - * source time - */ - sample.offset = - (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - - ts; - - /* simple insertion 
sort based on duration */ - index = counter - 1; - while (index >= 0) { - if (samples[index].duration_target < - sample.duration_target) - break; - samples[index + 1] = samples[index]; - index--; - } - samples[index + 1] = sample; - counter++; - } - - i++; - if (counter >= num_samples || i >= 100000) { - end_source = ts; - break; - } - } - - *source_tstamp = (end_source + start_source) / 2; - - /* remove outliers by only using 75% of the samples */ - used = counter * 3 / 4; - if (!used) - used = counter; - if (used) { - /* calculate average */ - s64 off = 0; - for (index = 0; index < used; index++) - off += samples[index].offset; - *offset = div_s64(off, used); - } - - if (samples && samples != buffer) - kfree(samples); - - return used; -} -EXPORT_SYMBOL_GPL(timecompare_offset); - -void __timecompare_update(struct timecompare *sync, - u64 source_tstamp) -{ - s64 offset; - u64 average_time; - - if (!timecompare_offset(sync, &offset, &average_time)) - return; - - if (!sync->last_update) { - sync->last_update = average_time; - sync->offset = offset; - sync->skew = 0; - } else { - s64 delta_nsec = average_time - sync->last_update; - - /* avoid division by negative or small deltas */ - if (delta_nsec >= 10000) { - s64 delta_offset_nsec = offset - sync->offset; - s64 skew; /* delta_offset_nsec * - TIMECOMPARE_SKEW_RESOLUTION / - delta_nsec */ - u64 divisor; - - /* div_s64() is limited to 32 bit divisor */ - skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; - divisor = delta_nsec; - while (unlikely(divisor >= ((s64)1) << 32)) { - /* divide both by 2; beware, right shift - of negative value has undefined - behavior and can only be used for - the positive divisor */ - skew = div_s64(skew, 2); - divisor >>= 1; - } - skew = div_s64(skew, divisor); - - /* - * Calculate new overall skew as 4/16 the - * old value and 12/16 the new one. 
This is - * a rather arbitrary tradeoff between - * only using the latest measurement (0/16 and - * 16/16) and even more weight on past measurements. - */ -#define TIMECOMPARE_NEW_SKEW_PER_16 12 - sync->skew = - div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * - sync->skew + - TIMECOMPARE_NEW_SKEW_PER_16 * skew, - 16); - sync->last_update = average_time; - sync->offset = offset; - } - } -} -EXPORT_SYMBOL_GPL(__timecompare_update); -/* - * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. - * This file is part of the GNU C Library. - * Contributed by Paul Eggert (eggert@twinsun.com). - * - * The GNU C Library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * The GNU C Library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the GNU C Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* - * Converts the calendar time to broken-down time representation - * Based on code from glibc-2.6 - * - * 2009-7-14: - * Moved from glibc-2.6 to kernel by Zhaolei - */ - -#include -#include - -/* - * Nonzero if YEAR is a leap year (every 4 years, - * except every 100th isn't, and every 400th is). 
- */ -static int __isleap(long year) -{ - return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); -} - -/* do a mathdiv for long type */ -static long math_div(long a, long b) -{ - return a / b - (a % b < 0); -} - -/* How many leap years between y1 and y2, y1 must less or equal to y2 */ -static long leaps_between(long y1, long y2) -{ - long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) - + math_div(y1 - 1, 400); - long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) - + math_div(y2 - 1, 400); - return leaps2 - leaps1; -} - -/* How many days come before each month (0-12). */ -static const unsigned short __mon_yday[2][13] = { - /* Normal years. */ - {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, - /* Leap years. */ - {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} -}; - -#define SECS_PER_HOUR (60 * 60) -#define SECS_PER_DAY (SECS_PER_HOUR * 24) - -/** - * time_to_tm - converts the calendar time to local broken-down time - * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, - * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time - */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) -{ - long days, rem, y; - const unsigned short *ip; - - days = totalsecs / SECS_PER_DAY; - rem = totalsecs % SECS_PER_DAY; - rem += offset; - while (rem < 0) { - rem += SECS_PER_DAY; - --days; - } - while (rem >= SECS_PER_DAY) { - rem -= SECS_PER_DAY; - ++days; - } - - result->tm_hour = rem / SECS_PER_HOUR; - rem %= SECS_PER_HOUR; - result->tm_min = rem / 60; - result->tm_sec = rem % 60; - - /* January 1, 1970 was a Thursday. */ - result->tm_wday = (4 + days) % 7; - if (result->tm_wday < 0) - result->tm_wday += 7; - - y = 1970; - - while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { - /* Guess a corrected year, assuming 365 days per year. 
*/ - long yg = y + math_div(days, 365); - - /* Adjust DAYS and Y to match the guessed year. */ - days -= (yg - y) * 365 + leaps_between(y, yg); - y = yg; - } - - result->tm_year = y - 1900; - - result->tm_yday = days; - - ip = __mon_yday[__isleap(y)]; - for (y = 11; days < ip[y]; y--) - continue; - days -= ip[y]; - - result->tm_mon = y; - result->tm_mday = days + 1; -} -EXPORT_SYMBOL(time_to_tm); -/* - * linux/kernel/time/timekeeping.c - * - * Kernel timekeeping code and accessor functions - * - * This code was moved from linux/kernel/timer.c. - * Please see that file for copyright and history logs. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Structure holding internal timekeeping values. */ -struct timekeeper { - /* Current clocksource used for timekeeping. */ - struct clocksource *clock; - /* The shift value of the current clocksource. */ - int shift; - - /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; - /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; - /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; - /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; - - /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ - u64 xtime_nsec; - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ - s64 ntp_error; - /* Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. */ - int ntp_error_shift; - /* NTP adjusted clock multiplier */ - u32 mult; -}; - -static struct timekeeper timekeeper; - -/** - * timekeeper_setup_internals - Set up internals to use clocksource clock. - * - * @clock: Pointer to clocksource. - * - * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment - * pair and interval request. 
- * - * Unless you're the timekeeping code, you should not be using this! - */ -static void timekeeper_setup_internals(struct clocksource *clock) -{ - cycle_t interval; - u64 tmp, ntpinterval; - - timekeeper.clock = clock; - clock->cycle_last = clock->read(clock); - - /* Do the ns -> cycle conversion first, using original mult */ - tmp = NTP_INTERVAL_LENGTH; - tmp <<= clock->shift; - ntpinterval = tmp; - tmp += clock->mult/2; - do_div(tmp, clock->mult); - if (tmp == 0) - tmp = 1; - - interval = (cycle_t) tmp; - timekeeper.cycle_interval = interval; - - /* Go back from cycles -> shifted ns */ - timekeeper.xtime_interval = (u64) interval * clock->mult; - timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; - timekeeper.raw_interval = - ((u64) interval * clock->mult) >> clock->shift; - - timekeeper.xtime_nsec = 0; - timekeeper.shift = clock->shift; - - timekeeper.ntp_error = 0; - timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; - - /* - * The timekeeper keeps its own mult values for the currently - * active clocksource. These value will be adjusted via NTP - * to counteract clock drifting. - */ - timekeeper.mult = clock->mult; -} - -/* Timekeeper helper functions. */ -static inline s64 timekeeping_get_ns(void) -{ - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; - - /* read clocksource: */ - clock = timekeeper.clock; - cycle_now = clock->read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* return delta convert to nanoseconds using ntp adjusted mult. 
*/ - return clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); -} - -static inline s64 timekeeping_get_ns_raw(void) -{ - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; - - /* read clocksource: */ - clock = timekeeper.clock; - cycle_now = clock->read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* return delta convert to nanoseconds. */ - return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); -} - -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ -static struct timespec xtime __attribute__ ((aligned (16))); -static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static struct timespec total_sleep_time; - -/* - * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
- */ -static struct timespec raw_time; - -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; - -/* must hold xtime_lock */ -void timekeeping_leap_insert(int leapsecond) -{ - xtime.tv_sec += leapsecond; - wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); -} - -/** - * timekeeping_forward_now - update clock to the current time - * - * Forward the current clock to update its state since the last call to - * update_wall_time(). This is useful before significant clock changes, - * as it avoids having to deal with this time offset explicitly. - */ -static void timekeeping_forward_now(void) -{ - cycle_t cycle_now, cycle_delta; - struct clocksource *clock; - s64 nsec; - - clock = timekeeper.clock; - cycle_now = clock->read(clock); - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; - - nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); - - /* If arch requires, add in gettimeoffset() */ - nsec += arch_gettimeoffset(); - - timespec_add_ns(&xtime, nsec); - - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&raw_time, nsec); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. 
- */ -void getnstimeofday(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - nsecs = timekeeping_get_ns(); - - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -EXPORT_SYMBOL(getnstimeofday); - -ktime_t ktime_get(void) -{ - unsigned int seq; - s64 secs, nsecs; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqbegin(&xtime_lock); - secs = xtime.tv_sec + wall_to_monotonic.tv_sec; - nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; - nsecs += timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); - /* - * Use ktime_set/ktime_add_ns to create a proper ktime on - * 32-bit architectures without CONFIG_KTIME_SCALAR. - */ - return ktime_add_ns(ktime_set(secs, 0), nsecs); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. 
- */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned int seq; - s64 nsecs; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqbegin(&xtime_lock); - *ts = xtime; - tomono = wall_to_monotonic; - nsecs = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec + nsecs); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -#ifdef CONFIG_NTP_PPS - -/** - * getnstime_raw_and_real - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. - */ -void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) -{ - unsigned long seq; - s64 nsecs_raw, nsecs_real; - - WARN_ON_ONCE(timekeeping_suspended); - - do { - u32 arch_offset; - - seq = read_seqbegin(&xtime_lock); - - *ts_raw = raw_time; - *ts_real = xtime; - - nsecs_raw = timekeeping_get_ns_raw(); - nsecs_real = timekeeping_get_ns(); - - /* If arch requires, add in gettimeoffset() */ - arch_offset = arch_gettimeoffset(); - nsecs_raw += arch_offset; - nsecs_real += arch_offset; - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts_raw, nsecs_raw); - timespec_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(getnstime_raw_and_real); - -#endif /* CONFIG_NTP_PPS */ - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - getnstimeofday(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - 
-EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(const struct timespec *tv) -{ - struct timespec ts_delta; - unsigned long flags; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - timekeeping_forward_now(); - - ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - - xtime = *tv; - - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - - -/** - * timekeeping_inject_offset - Adds or subtracts from the current time. - * @tv: pointer to the timespec variable containing the offset - * - * Adds or subtracts an offset value from the current time. 
- */ -int timekeeping_inject_offset(struct timespec *ts) -{ - unsigned long flags; - - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - timekeeping_forward_now(); - - xtime = timespec_add(xtime, *ts); - wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); - - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} -EXPORT_SYMBOL(timekeeping_inject_offset); - -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new clocksource - */ -static int change_clocksource(void *data) -{ - struct clocksource *new, *old; - - new = (struct clocksource *) data; - - timekeeping_forward_now(); - if (!new->enable || new->enable(new) == 0) { - old = timekeeper.clock; - timekeeper_setup_internals(new); - if (old->disable) - old->disable(old); - } - return 0; -} - -/** - * timekeeping_notify - Install a new clock source - * @clock: pointer to the clock source - * - * This function is called from clocksource.c after a new, better clock - * source has been registered. The caller holds the clocksource_mutex. 
- */ -void timekeeping_notify(struct clocksource *clock) -{ - if (timekeeper.clock == clock) - return; - stop_machine(change_clocksource, clock, NULL); - tick_clock_notify(); -} - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get_real); - -/** - * getrawmonotonic - Returns the raw monotonic time in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the raw monotonic time (completely un-modified by ntp) - */ -void getrawmonotonic(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - do { - seq = read_seqbegin(&xtime_lock); - nsecs = timekeeping_get_ns_raw(); - *ts = raw_time; - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} -EXPORT_SYMBOL(getrawmonotonic); - - -/** - * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres - */ -int timekeeping_valid_for_hres(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; -} - -/** - * timekeeping_max_deferment - Returns max time the clocksource can be deferred - * - * Caller must observe xtime_lock via read_seqbegin/read_seqretry to - * ensure that the clocksource does not change! - */ -u64 timekeeping_max_deferment(void) -{ - return timekeeper.clock->max_idle_ns; -} - -/** - * read_persistent_clock - Return time from the persistent clock. - * - * Weak dummy function for arches that do not yet support it. - * Reads the time from the battery backed persistent clock. - * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. 
- */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) -{ - ts->tv_sec = 0; - ts->tv_nsec = 0; -} - -/** - * read_boot_clock - Return time of the system start. - * - * Weak dummy function for arches that do not yet support it. - * Function to read the exact time the system has been started. - * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. - */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) -{ - ts->tv_sec = 0; - ts->tv_nsec = 0; -} - -/* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - struct clocksource *clock; - unsigned long flags; - struct timespec now, boot; - - read_persistent_clock(&now); - read_boot_clock(&boot); - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_init(); - - clock = clocksource_default_clock(); - if (clock->enable) - clock->enable(clock); - timekeeper_setup_internals(clock); - - xtime.tv_sec = now.tv_sec; - xtime.tv_nsec = now.tv_nsec; - raw_time.tv_sec = 0; - raw_time.tv_nsec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = xtime.tv_sec; - boot.tv_nsec = xtime.tv_nsec; - } - set_normalized_timespec(&wall_to_monotonic, - -boot.tv_sec, -boot.tv_nsec); - total_sleep_time.tv_sec = 0; - total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); -} - -/* time in seconds when suspend began */ -static struct timespec timekeeping_suspend_time; - -/** - * __timekeeping_inject_sleeptime - Internal function to add sleep interval - * @delta: pointer to a timespec delta value - * - * Takes a timespec offset measuring a suspend interval and properly - * adds the sleep offset to the timekeeping variables. 
- */ -static void __timekeeping_inject_sleeptime(struct timespec *delta) -{ - if (!timespec_valid(delta)) { - printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); - return; - } - - xtime = timespec_add(xtime, *delta); - wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); - total_sleep_time = timespec_add(total_sleep_time, *delta); -} - - -/** - * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values - * @delta: pointer to a timespec delta value - * - * This hook is for architectures that cannot support read_persistent_clock - * because their RTC/persistent clock is only accessible when irqs are enabled. - * - * This function should only be called by rtc_resume(), and allows - * a suspend offset to be injected into the timekeeping values. - */ -void timekeeping_inject_sleeptime(struct timespec *delta) -{ - unsigned long flags; - struct timespec ts; - - /* Make sure we don't set the clock twice */ - read_persistent_clock(&ts); - if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) - return; - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_forward_now(); - - __timekeeping_inject_sleeptime(delta); - - timekeeper.ntp_error = 0; - ntp_clear(); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); -} - - -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. 
- */ -static void timekeeping_resume(void) -{ - unsigned long flags; - struct timespec ts; - - read_persistent_clock(&ts); - - clocksource_resume(); - - write_seqlock_irqsave(&xtime_lock, flags); - - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&ts); - } - /* re-base the last cycle value */ - timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); - timekeeper.ntp_error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - - touch_softlockup_watchdog(); - - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ - hrtimers_resume(); -} - -static int timekeeping_suspend(void) -{ - unsigned long flags; - struct timespec delta, delta_delta; - static struct timespec old_delta; - - read_persistent_clock(&timekeeping_suspend_time); - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_forward_now(); - timekeeping_suspended = 1; - - /* - * To avoid drift caused by repeated suspend/resumes, - * which each can add ~1 second drift error, - * try to compensate so the difference in system time - * and persistent_clock time stays close to constant. - */ - delta = timespec_sub(xtime, timekeeping_suspend_time); - delta_delta = timespec_sub(delta, old_delta); - if (abs(delta_delta.tv_sec) >= 2) { - /* - * if delta_delta is too large, assume time correction - * has occured and set old_delta to the current delta. 
- */ - old_delta = delta; - } else { - /* Otherwise try to adjust old_system to compensate */ - timekeeping_suspend_time = - timespec_add(timekeeping_suspend_time, delta_delta); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - clocksource_suspend(); - - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, -}; - -static int __init timekeeping_init_ops(void) -{ - register_syscore_ops(&timekeeping_syscore_ops); - return 0; -} - -device_initcall(timekeeping_init_ops); - -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adjusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); - tick_error -= timekeeper.xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. 
*/ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void timekeeping_adjust(s64 offset) -{ - s64 error, interval = timekeeper.cycle_interval; - int adj; - - /* - * The point of this is to check if the error is greater then half - * an interval. - * - * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. - * - * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) interval twice, but keeps the - * (error > interval) comparison as still measuring if error is - * larger then half an interval. - * - * Note: It does not "save" on aggravation when reading the code. - */ - error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); - if (error > interval) { - /* - * We now divide error by 4(via shift), which checks if - * the error is greater then twice the interval. - * If it is greater, we need a bigadjust, if its smaller, - * we can adjust by 1. - */ - error >>= 2; - /* - * XXX - In update_wall_time, we round up to the next - * nanosecond, and store the amount rounded up into - * the error. This causes the likely below to be unlikely. - * - * The proper fix is to avoid rounding up by using - * the high precision timekeeper.xtime_nsec instead of - * xtime.tv_nsec everywhere. Fixing this will take some - * time. 
- */ - if (likely(error <= interval)) - adj = 1; - else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else /* No adjustment needed */ - return; - - WARN_ONCE(timekeeper.clock->maxadj && - (timekeeper.mult + adj > timekeeper.clock->mult + - timekeeper.clock->maxadj), - "Adjusting %s more then 11%% (%ld vs %ld)\n", - timekeeper.clock->name, (long)timekeeper.mult + adj, - (long)timekeeper.clock->mult + - timekeeper.clock->maxadj); - /* - * So the following can be confusing. - * - * To keep things simple, lets assume adj == 1 for now. - * - * When adj != 1, remember that the interval and offset values - * have been appropriately scaled so the math is the same. - * - * The basic idea here is that we're increasing the multiplier - * by one, this causes the xtime_interval to be incremented by - * one cycle_interval. This is because: - * xtime_interval = cycle_interval * mult - * So if mult is being incremented by one: - * xtime_interval = cycle_interval * (mult + 1) - * Its the same as: - * xtime_interval = (cycle_interval * mult) + cycle_interval - * Which can be shortened to: - * xtime_interval += cycle_interval - * - * So offset stores the non-accumulated cycles. Thus the current - * time (in shifted nanoseconds) is: - * now = (offset * adj) + xtime_nsec - * Now, even though we're adjusting the clock frequency, we have - * to keep time consistent. In other words, we can't jump back - * in time, and we also want to avoid jumping forward in time. - * - * So given the same offset value, we need the time to be the same - * both before and after the freq adjustment. 
- * now = (offset * adj_1) + xtime_nsec_1 - * now = (offset * adj_2) + xtime_nsec_2 - * So: - * (offset * adj_1) + xtime_nsec_1 = - * (offset * adj_2) + xtime_nsec_2 - * And we know: - * adj_2 = adj_1 + 1 - * So: - * (offset * adj_1) + xtime_nsec_1 = - * (offset * (adj_1+1)) + xtime_nsec_2 - * (offset * adj_1) + xtime_nsec_1 = - * (offset * adj_1) + offset + xtime_nsec_2 - * Canceling the sides: - * xtime_nsec_1 = offset + xtime_nsec_2 - * Which gives us: - * xtime_nsec_2 = xtime_nsec_1 - offset - * Which simplfies to: - * xtime_nsec -= offset - * - * XXX - TODO: Doc ntp_error calculation. - */ - timekeeper.mult += adj; - timekeeper.xtime_interval += interval; - timekeeper.xtime_nsec -= offset; - timekeeper.ntp_error -= (interval - offset) << - timekeeper.ntp_error_shift; -} - - -/** - * logarithmic_accumulation - shifted accumulation of cycles - * - * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation - * loop. - * - * Returns the unconsumed cycles. 
- */ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) -{ - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; - u64 raw_nsecs; - - /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < timekeeper.cycle_interval<cycle_last += timekeeper.cycle_interval << shift; - - timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - while (timekeeper.xtime_nsec >= nsecps) { - timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; - second_overflow(); - } - - /* Accumulate raw time */ - raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += raw_time.tv_nsec; - if (raw_nsecs >= NSEC_PER_SEC) { - u64 raw_secs = raw_nsecs; - raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - raw_time.tv_sec += raw_secs; - } - raw_time.tv_nsec = raw_nsecs; - - /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length << shift; - timekeeper.ntp_error -= - (timekeeper.xtime_interval + timekeeper.xtime_remainder) << - (timekeeper.ntp_error_shift + shift); - - return offset; -} - - -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. - */ -static void update_wall_time(void) -{ - struct clocksource *clock; - cycle_t offset; - int shift = 0, maxshift; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - - clock = timekeeper.clock; - -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = timekeeper.cycle_interval; -#else - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; -#endif - timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; - - /* - * With NO_HZ we may have to accumulate many cycle_intervals - * (think "ticks") worth of time at once. To do this efficiently, - * we calculate the largest doubling multiple of cycle_intervals - * that is smaller then the offset. 
We then accumulate that - * chunk in one go, and then try to consume the next smaller - * doubled multiple. - */ - shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); - shift = max(0, shift); - /* Bound shift to one less then what overflows tick_length */ - maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; - shift = min(shift, maxshift); - while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(offset, shift); - if(offset < timekeeper.cycle_interval<> timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - timekeeper.ntp_error_shift; - - /* - * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger then NSEC_PER_SEC - */ - if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { - xtime.tv_nsec -= NSEC_PER_SEC; - xtime.tv_sec++; - second_overflow(); - } - - /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); -} - -/** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set - * - * Returns the wall-time of boot in a timespec. - * - * This is based on the wall_to_monotonic offset and the total suspend - * time. Calls to settimeofday will affect the value returned (which - * basically means that however wrong your real time clock is at boot time, - * you get the right time here). - */ -void getboottime(struct timespec *ts) -{ - struct timespec boottime = { - .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, - .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec - }; - - set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); -} -EXPORT_SYMBOL_GPL(getboottime); - - -/** - * get_monotonic_boottime - Returns monotonic time since boot - * @ts: pointer to the timespec to be set - * - * Returns the monotonic time since boot in a timespec. 
- * - * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also - * includes the time spent in suspend. - */ -void get_monotonic_boottime(struct timespec *ts) -{ - struct timespec tomono, sleep; - unsigned int seq; - s64 nsecs; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqbegin(&xtime_lock); - *ts = xtime; - tomono = wall_to_monotonic; - sleep = total_sleep_time; - nsecs = timekeeping_get_ns(); - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); -} -EXPORT_SYMBOL_GPL(get_monotonic_boottime); - -/** - * ktime_get_boottime - Returns monotonic time since boot in a ktime - * - * Returns the monotonic time since boot in a ktime - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also - * includes the time spent in suspend. - */ -ktime_t ktime_get_boottime(void) -{ - struct timespec ts; - - get_monotonic_boottime(&ts); - return timespec_to_ktime(ts); -} -EXPORT_SYMBOL_GPL(ktime_get_boottime); - -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. 
- * @ts: pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ - *ts = timespec_add(*ts, total_sleep_time); -} -EXPORT_SYMBOL_GPL(monotonic_to_bootbased); - -unsigned long get_seconds(void) -{ - return xtime.tv_sec; -} -EXPORT_SYMBOL(get_seconds); - -struct timespec __current_kernel_time(void) -{ - return xtime; -} - -struct timespec current_kernel_time(void) -{ - struct timespec now; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - - now = xtime; - } while (read_seqretry(&xtime_lock, seq)); - - return now; -} -EXPORT_SYMBOL(current_kernel_time); - -struct timespec get_monotonic_coarse(void) -{ - struct timespec now, mono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - - now = xtime; - mono = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); - return now; -} - -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} - -/** - * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, - * and sleep offsets. 
- * @xtim: pointer to timespec to be set with xtime - * @wtom: pointer to timespec to be set with wall_to_monotonic - * @sleep: pointer to timespec to be set with time in suspend - */ -void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, - struct timespec *wtom, struct timespec *sleep) -{ - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - *xtim = xtime; - *wtom = wall_to_monotonic; - *sleep = total_sleep_time; - } while (read_seqretry(&xtime_lock, seq)); -} - -/** - * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format - */ -ktime_t ktime_get_monotonic_offset(void) -{ - unsigned long seq; - struct timespec wtom; - - do { - seq = read_seqbegin(&xtime_lock); - wtom = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - return timespec_to_ktime(wtom); -} - -/** - * xtime_update() - advances the timekeeping infrastructure - * @ticks: number of ticks, that have elapsed since the last call. - * - * Must be called with interrupts disabled. - */ -void xtime_update(unsigned long ticks) -{ - write_seqlock(&xtime_lock); - do_timer(ticks); - write_sequnlock(&xtime_lock); -} -/* - * kernel/time/timer_list.c - * - * List pending timers - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); - -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - -/* - * This allows printing both to /proc/timer_list and - * to the console (on SysRq-Q): - */ -#define SEQ_printf(m, x...) 
\ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) - -static void print_name_offset(struct seq_file *m, void *sym) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%pK>", sym); - else - SEQ_printf(m, "%s", symname); -} - -static void -print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, - int idx, u64 now) -{ -#ifdef CONFIG_TIMER_STATS - char tmp[TASK_COMM_LEN + 1]; -#endif - SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, taddr); - SEQ_printf(m, ", "); - print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); -#ifdef CONFIG_TIMER_STATS - SEQ_printf(m, ", "); - print_name_offset(m, timer->start_site); - memcpy(tmp, timer->start_comm, TASK_COMM_LEN); - tmp[TASK_COMM_LEN] = 0; - SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); -#endif - SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", - (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), - (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), - (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), - (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); -} - -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) -{ - struct hrtimer *timer, tmp; - unsigned long next = 0, i; - struct timerqueue_node *curr; - unsigned long flags; - -next_one: - i = 0; - raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - - curr = timerqueue_getnext(&base->active); - /* - * Crude but we have to do this O(N*N) thing, because - * we have to unlock the base when printing: - */ - while (curr && i < next) { - curr = timerqueue_iterate_next(curr); - i++; - } - - if (curr) { - - timer = container_of(curr, struct hrtimer, node); - tmp = *timer; - raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); - - print_timer(m, timer, &tmp, i, now); - next++; - goto next_one; - } 
- raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); -} - -static void -print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) -{ - SEQ_printf(m, " .base: %pK\n", base); - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned long long)ktime_to_ns(base->resolution)); - SEQ_printf(m, " .get_time: "); - print_name_offset(m, base->get_time); - SEQ_printf(m, "\n"); -#ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Lu nsecs\n", - (unsigned long long) ktime_to_ns(base->offset)); -#endif - SEQ_printf(m, "active timers:\n"); - print_active_timers(m, base, now); -} - -static void print_cpu(struct seq_file *m, int cpu, u64 now) -{ - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - - SEQ_printf(m, "\n"); - SEQ_printf(m, "cpu: %d\n", cpu); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - SEQ_printf(m, " clock %d:\n", i); - print_base(m, cpu_base->clock_base + i, now); - } -#define P(x) \ - SEQ_printf(m, " .%-15s: %Lu\n", #x, \ - (unsigned long long)(cpu_base->x)) -#define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ - (unsigned long long)(ktime_to_ns(cpu_base->x))) - -#ifdef CONFIG_HIGH_RES_TIMERS - P_ns(expires_next); - P(hres_active); - P(nr_events); - P(nr_retries); - P(nr_hangs); - P_ns(max_hang_time); -#endif -#undef P -#undef P_ns - -#ifdef CONFIG_TICK_ONESHOT -# define P(x) \ - SEQ_printf(m, " .%-15s: %Lu\n", #x, \ - (unsigned long long)(ts->x)) -# define P_ns(x) \ - SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ - (unsigned long long)(ktime_to_ns(ts->x))) - { - struct tick_sched *ts = tick_get_tick_sched(cpu); - P(nohz_mode); - P_ns(idle_tick); - P(tick_stopped); - P(idle_jiffies); - P(idle_calls); - P(idle_sleeps); - P_ns(idle_entrytime); - P_ns(idle_waketime); - P_ns(idle_exittime); - P_ns(idle_sleeptime); - P_ns(iowait_sleeptime); - P(last_jiffies); - P(next_jiffies); - P_ns(idle_expires); - SEQ_printf(m, "jiffies: %Lu\n", - (unsigned long 
long)jiffies); - } -#endif - -#undef P -#undef P_ns -} - -#ifdef CONFIG_GENERIC_CLOCKEVENTS -static void -print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) -{ - struct clock_event_device *dev = td->evtdev; - - SEQ_printf(m, "\n"); - SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); - if (cpu < 0) - SEQ_printf(m, "Broadcast device\n"); - else - SEQ_printf(m, "Per CPU device: %d\n", cpu); - - SEQ_printf(m, "Clock Event Device: "); - if (!dev) { - SEQ_printf(m, "\n"); - return; - } - SEQ_printf(m, "%s\n", dev->name); - SEQ_printf(m, " max_delta_ns: %llu\n", - (unsigned long long) dev->max_delta_ns); - SEQ_printf(m, " min_delta_ns: %llu\n", - (unsigned long long) dev->min_delta_ns); - SEQ_printf(m, " mult: %u\n", dev->mult); - SEQ_printf(m, " shift: %u\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); - SEQ_printf(m, " next_event: %Ld nsecs\n", - (unsigned long long) ktime_to_ns(dev->next_event)); - - SEQ_printf(m, " set_next_event: "); - print_name_offset(m, dev->set_next_event); - SEQ_printf(m, "\n"); - - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); - SEQ_printf(m, "\n"); - - SEQ_printf(m, " event_handler: "); - print_name_offset(m, dev->event_handler); - SEQ_printf(m, "\n"); - SEQ_printf(m, " retries: %lu\n", dev->retries); -} - -static void timer_list_show_tickdevices(struct seq_file *m) -{ - int cpu; - -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device(), -1); - SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_mask())[0]); -#ifdef CONFIG_TICK_ONESHOT - SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); -#endif - SEQ_printf(m, "\n"); -#endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); -} -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } -#endif - -static int timer_list_show(struct seq_file *m, void *v) -{ 
- u64 now = ktime_to_ns(ktime_get()); - int cpu; - - SEQ_printf(m, "Timer List Version: v0.6\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); - - for_each_online_cpu(cpu) - print_cpu(m, cpu, now); - - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); - - return 0; -} - -void sysrq_timer_list_show(void) -{ - timer_list_show(NULL, NULL); -} - -static int timer_list_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, timer_list_show, NULL); -} - -static const struct file_operations timer_list_fops = { - .open = timer_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init init_timer_list_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("timer_list", 0444, NULL, &timer_list_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_timer_list_procfs); -/* - * kernel/time/timer_stats.c - * - * Collect timer usage statistics. - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006 Timesys Corp., Thomas Gleixner - * - * timer_stats is based on timer_top, a similar functionality which was part of - * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the - * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based - * on dynamic allocation of the statistics entries and linear search based - * lookup combined with a global lock, rather than the static array, hash - * and per-CPU locking which is used by timer_stats. It was written for the - * pre hrtimer kernel code and therefore did not take hrtimers into account. - * Nevertheless it provided the base for the timer_stats implementation and - * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks - * for this effort. 
- * - * timer_top.c is - * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus - * Written by Daniel Petrini - * timer_top.c was released under the GNU General Public License version 2 - * - * We export the addresses and counting of timer functions being called, - * the pid and cmdline from the owner process if applicable. - * - * Start/stop data collection: - * # echo [1|0] >/proc/timer_stats - * - * Display the information collected so far: - * # cat /proc/timer_stats - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * This is our basic unit of interest: a timer expiry event identified - * by the timer, its start/expire functions and the PID of the task that - * started the timer. We count the number of times an event happens: - */ -struct entry { - /* - * Hash list: - */ - struct entry *next; - - /* - * Hash keys: - */ - void *timer; - void *start_func; - void *expire_func; - pid_t pid; - - /* - * Number of timeout events: - */ - unsigned long count; - unsigned int timer_flag; - - /* - * We save the command-line string to preserve - * this information past task exit: - */ - char comm[TASK_COMM_LEN + 1]; - -} ____cacheline_aligned_in_smp; - -/* - * Spinlock protecting the tables - not taken during lookup: - */ -static DEFINE_RAW_SPINLOCK(table_lock); - -/* - * Per-CPU lookup locks for fast hash lookup: - */ -static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); - -/* - * Mutex to serialize state changes with show-stats activities: - */ -static DEFINE_MUTEX(show_mutex); - -/* - * Collection status, active/inactive: - */ -int __read_mostly timer_stats_active; - -/* - * Beginning/end timestamps of measurement: - */ -static ktime_t time_start, time_stop; - -/* - * tstat entry structs only get allocated while collection is 
- * active and never freed during that time - this simplifies - * things quite a bit. - * - * They get freed when a new collection period is started. - */ -#define MAX_ENTRIES_BITS 10 -#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) - -static unsigned long nr_entries; -static struct entry entries[MAX_ENTRIES]; - -static atomic_t overflow_count; - -/* - * The entries are in a hash-table, for fast lookup: - */ -#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) -#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) -#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) - -#define __tstat_hashfn(entry) \ - (((unsigned long)(entry)->timer ^ \ - (unsigned long)(entry)->start_func ^ \ - (unsigned long)(entry)->expire_func ^ \ - (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) - -#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) - -static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; - -static void reset_entries(void) -{ - nr_entries = 0; - memset(entries, 0, sizeof(entries)); - memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); - atomic_set(&overflow_count, 0); -} - -static struct entry *alloc_entry(void) -{ - if (nr_entries >= MAX_ENTRIES) - return NULL; - - return entries + nr_entries++; -} - -static int match_entries(struct entry *entry1, struct entry *entry2) -{ - return entry1->timer == entry2->timer && - entry1->start_func == entry2->start_func && - entry1->expire_func == entry2->expire_func && - entry1->pid == entry2->pid; -} - -/* - * Look up whether an entry matching this item is present - * in the hash already. 
Must be called with irqs off and the - * lookup lock held: - */ -static struct entry *tstat_lookup(struct entry *entry, char *comm) -{ - struct entry **head, *curr, *prev; - - head = tstat_hashentry(entry); - curr = *head; - - /* - * The fastpath is when the entry is already hashed, - * we do this with the lookup lock held, but with the - * table lock not held: - */ - while (curr) { - if (match_entries(curr, entry)) - return curr; - - curr = curr->next; - } - /* - * Slowpath: allocate, set up and link a new hash entry: - */ - prev = NULL; - curr = *head; - - raw_spin_lock(&table_lock); - /* - * Make sure we have not raced with another CPU: - */ - while (curr) { - if (match_entries(curr, entry)) - goto out_unlock; - - prev = curr; - curr = curr->next; - } - - curr = alloc_entry(); - if (curr) { - *curr = *entry; - curr->count = 0; - curr->next = NULL; - memcpy(curr->comm, comm, TASK_COMM_LEN); - - smp_mb(); /* Ensure that curr is initialized before insert */ - - if (prev) - prev->next = curr; - else - *head = curr; - } - out_unlock: - raw_spin_unlock(&table_lock); - - return curr; -} - -/** - * timer_stats_update_stats - Update the statistics for a timer. - * @timer: pointer to either a timer_list or a hrtimer - * @pid: the pid of the task which set up the timer - * @startf: pointer to the function which did the timer setup - * @timerf: pointer to the timer callback function of the timer - * @comm: name of the process which set up the timer - * - * When the timer is already registered, then the event counter is - * incremented. Otherwise the timer is registered in a free slot. 
- */ -void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag) -{ - /* - * It doesn't matter which lock we take: - */ - raw_spinlock_t *lock; - struct entry *entry, input; - unsigned long flags; - - if (likely(!timer_stats_active)) - return; - - lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); - - input.timer = timer; - input.start_func = startf; - input.expire_func = timerf; - input.pid = pid; - input.timer_flag = timer_flag; - - raw_spin_lock_irqsave(lock, flags); - if (!timer_stats_active) - goto out_unlock; - - entry = tstat_lookup(&input, comm); - if (likely(entry)) - entry->count++; - else - atomic_inc(&overflow_count); - - out_unlock: - raw_spin_unlock_irqrestore(lock, flags); -} - -static void print_name_offset(struct seq_file *m, unsigned long addr) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name(addr, symname) < 0) - seq_printf(m, "<%p>", (void *)addr); - else - seq_printf(m, "%s", symname); -} - -static int tstats_show(struct seq_file *m, void *v) -{ - struct timespec period; - struct entry *entry; - unsigned long ms; - long events = 0; - ktime_t time; - int i; - - mutex_lock(&show_mutex); - /* - * If still active then calculate up to now: - */ - if (timer_stats_active) - time_stop = ktime_get(); - - time = ktime_sub(time_stop, time_start); - - period = ktime_to_timespec(time); - ms = period.tv_nsec / 1000000; - - seq_puts(m, "Timer Stats Version: v0.2\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); - if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", - atomic_read(&overflow_count)); - - for (i = 0; i < nr_entries; i++) { - entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { - seq_printf(m, "%4luD, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } else { - seq_printf(m, " %4lu, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } - - print_name_offset(m, (unsigned 
long)entry->start_func); - seq_puts(m, " ("); - print_name_offset(m, (unsigned long)entry->expire_func); - seq_puts(m, ")\n"); - - events += entry->count; - } - - ms += period.tv_sec * 1000; - if (!ms) - ms = 1; - - if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", - events, events * 1000 / ms, - (events * 1000000 / ms) % 1000); - else - seq_printf(m, "%ld total events\n", events); - - mutex_unlock(&show_mutex); - - return 0; -} - -/* - * After a state change, make sure all concurrent lookup/update - * activities have stopped: - */ -static void sync_access(void) -{ - unsigned long flags; - int cpu; - - for_each_online_cpu(cpu) { - raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); - - raw_spin_lock_irqsave(lock, flags); - /* nothing */ - raw_spin_unlock_irqrestore(lock, flags); - } -} - -static ssize_t tstats_write(struct file *file, const char __user *buf, - size_t count, loff_t *offs) -{ - char ctl[2]; - - if (count != 2 || *offs) - return -EINVAL; - - if (copy_from_user(ctl, buf, count)) - return -EFAULT; - - mutex_lock(&show_mutex); - switch (ctl[0]) { - case '0': - if (timer_stats_active) { - timer_stats_active = 0; - time_stop = ktime_get(); - sync_access(); - } - break; - case '1': - if (!timer_stats_active) { - reset_entries(); - time_start = ktime_get(); - smp_mb(); - timer_stats_active = 1; - } - break; - default: - count = -EINVAL; - } - mutex_unlock(&show_mutex); - - return count; -} - -static int tstats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, tstats_show, NULL); -} - -static const struct file_operations tstats_fops = { - .open = tstats_open, - .read = seq_read, - .write = tstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -void __init init_timer_stats(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); -} - -static int __init init_tstats_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = 
proc_create("timer_stats", 0644, NULL, &tstats_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_tstats_procfs); -/* - * linux/kernel/time.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c - * - * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() - * 1993-10-08 Torsten Duwe - * adjtime interface update and CMOS clock write code - * 1995-08-13 Torsten Duwe - * kernel PLL updated to 1994-12-13 specs (rfc-1589) - * 1999-01-16 Ulrich Windl - * Introduced error checking for many cases in adjtimex(). - * Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) - * (Even though the technical memorandum forbids it) - * 2004-07-14 Christoph Lameter - * Added getnstimeofday to allow the posix timer functions to return - * with nanosecond accuracy - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "timeconst.h" - -/* - * The timezone where the local system is located. Used as a default by some - * programs who obtain this value by using gettimeofday. - */ -struct timezone sys_tz; - -EXPORT_SYMBOL(sys_tz); - -#ifdef __ARCH_WANT_SYS_TIME - -/* - * sys_time() can be implemented in user-level using - * sys_gettimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). 
- */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) -{ - time_t i = get_seconds(); - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -/* - * sys_stime() can be implemented in user-level using - * sys_settimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ - -SYSCALL_DEFINE1(stime, time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_SYS_TIME */ - -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) - return -EFAULT; - } - if (unlikely(tz != NULL)) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. 
- */ -static inline void warp_clock(void) -{ - struct timespec adjust; - - adjust = current_kernel_time(); - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); -} - -/* - * In case for some reason the CMOS clock has not already been running - * in UTC, but in some local time: The first time we set the timezone, - * we will warp the clock so that it is ticking UTC time instead of - * local time. Presumably, if someone is setting the timezone then we - * are running in an environment where the programs understand about - * timezones. This should be done at boot time in the /etc/rc script, - * as soon as possible, so that the clock can be set right. Otherwise, - * various programs will get confused when the clock gets warped. - */ - -int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) -{ - static int firsttime = 1; - int error = 0; - - if (tv && !timespec_valid(tv)) - return -EINVAL; - - error = security_settime(tv, tz); - if (error) - return error; - - if (tz) { - /* SMP safe, global irq locking makes it work. */ - sys_tz = *tz; - update_vsyscall_tz(); - if (firsttime) { - firsttime = 0; - if (!tv) - warp_clock(); - } - } - if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. - */ - return do_settimeofday(tv); - } - return 0; -} - -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timeval user_tv; - struct timespec new_ts; - struct timezone new_tz; - - if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); -} - -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) -{ - struct timex txc; /* Local copy of parameter */ - int ret; - - /* Copy the user data space into the kernel copy - * structure. But bear in mind that the structures - * may change - */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) - return -EFAULT; - ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; -} - -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - -/* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -inline unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; -# else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -inline unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else -# if BITS_PER_LONG == 32 - return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else - return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. 
- * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. - */ - if (gran <= jiffies_to_usecs(1) * 1000) { - /* nothing */ - } else if (gran == 1000000000) { - t.tv_nsec = 0; - } else { - t.tv_nsec -= t.tv_nsec % gran; - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. - * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 - * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. - * - * [For the Julian calendar (which was used in Russia before 1917, - * Britain & colonies before 1752, anywhere else before 1582, - * and is still in use by some communities) leave out the - * -year/100+year/400 terms, and add 10.] - * - * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! 
(However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) - */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) -{ - unsigned int mon = mon0, year = year0; - - /* 1..12 -> 11,12,1..10 */ - if (0 >= (int) (mon -= 2)) { - mon += 12; /* Puts Feb last since it has leap day */ - year -= 1; - } - - return ((((unsigned long) - (year/4 - year/100 + year/400 + 367*mon/12 + day) + - year*365 - 719499 - )*24 + hour /* now have hours */ - )*60 + min /* now have minutes */ - )*60 + sec; /* finally seconds */ -} - -EXPORT_SYMBOL(mktime); - -/** - * set_normalized_timespec - set timespec sec and nsec parts and normalize - * - * @ts: pointer to timespec variable to be set - * @sec: seconds to set - * @nsec: nanoseconds to set - * - * Set seconds and nanoseconds field of a timespec variable and - * normalize to the timespec storage format - * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC - * For negative values only the tv_sec field is negative ! - */ -void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) -{ - while (nsec >= NSEC_PER_SEC) { - /* - * The following asm() prevents the compiler from - * optimising this loop into a modulo operation. See - * also __iter_div_u64_rem() in include/linux/time.h - */ - asm("" : "+rm"(nsec)); - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - asm("" : "+rm"(nsec)); - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} -EXPORT_SYMBOL(set_normalized_timespec); - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. 
- */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - -/* - * When we convert to jiffies then we interpret incoming values - * the following way: - * - * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) - * - * - 'too large' values [that would result in larger than - * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. - * - * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. - */ -unsigned long msecs_to_jiffies(const unsigned int m) -{ - /* - * Negative value, means infinite timeout: - */ - if ((int)m < 0) - return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. 
- * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(msecs_to_jiffies); - -unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif -} -EXPORT_SYMBOL(usecs_to_jiffies); - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. - * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. 
- */ -unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} -EXPORT_SYMBOL(timespec_to_jiffies); - -void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u32 rem; - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_nsec = rem; -} -EXPORT_SYMBOL(jiffies_to_timespec); - -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. - */ -unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} -EXPORT_SYMBOL(timeval_to_jiffies); - -void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. 
- */ - u32 rem; - - value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, - NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} -EXPORT_SYMBOL(jiffies_to_timeval); - -/* - * Convert jiffies/jiffies_64 to clock_t and back. - */ -clock_t jiffies_to_clock_t(unsigned long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - return x * (USER_HZ / HZ); -# else - return x / (HZ / USER_HZ); -# endif -#else - return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); -#endif -} -EXPORT_SYMBOL(jiffies_to_clock_t); - -unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - return div_u64((u64)x * HZ, USER_HZ); -#endif -} -EXPORT_SYMBOL(clock_t_to_jiffies); - -u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 -# if HZ < USER_HZ - x = div_u64(x * USER_HZ, HZ); -# elif HZ > USER_HZ - x = div_u64(x, HZ / USER_HZ); -# else - /* Nothing to do */ -# endif -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. - */ - x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} -EXPORT_SYMBOL(jiffies_64_to_clock_t); - -u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - return div_u64(x, NSEC_PER_SEC / USER_HZ); -#elif (USER_HZ % 512) == 0 - return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
- */ - return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); -#endif -} - -/** - * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 - * - * @n: nsecs in u64 - * - * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. - * And this doesn't return MAX_JIFFY_OFFSET since this function is designed - * for scheduler, not for use in device drivers to calculate timeout value. - * - * note: - * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) - * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years - */ -u64 nsecs_to_jiffies64(u64 n) -{ -#if (NSEC_PER_SEC % HZ) == 0 - /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ - return div_u64(n, NSEC_PER_SEC / HZ); -#elif (HZ % 512) == 0 - /* overflow after 292 years if HZ = 1024 */ - return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); -#else - /* - * Generic case - optimized for cases where HZ is a multiple of 3. - * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. - */ - return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); -#endif -} - -/** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies - * - * @n: nsecs in u64 - * - * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. - * And this doesn't return MAX_JIFFY_OFFSET since this function is designed - * for scheduler, not for use in device drivers to calculate timeout value. - * - * note: - * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) - * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years - */ -unsigned long nsecs_to_jiffies(u64 n) -{ - return (unsigned long)nsecs_to_jiffies64(n); -} - -/* - * Add two timespec values and do a safety check for overflow. 
- * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} -/* - * linux/kernel/timer.c - * - * Kernel internal timers, basic process system calls - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. - * - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - * Copyright (C) 1998 Andrea Arcangeli - * 1999-03-10 Improved NTP compatibility by Ulrich Windl - * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love - * 2000-10-05 Implemented scalable SMP per-CPU timer handling. - * Copyright (C) 2000, 2001, 2002 Ingo Molnar - * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -/* - * per-CPU timer vector definitions: - */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -struct tvec { - struct list_head vec[TVN_SIZE]; -}; - -struct tvec_root { - struct list_head vec[TVR_SIZE]; -}; - -struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long timer_jiffies; - unsigned long next_timer; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; -} ____cacheline_aligned; - -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; - -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); -} - -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) -{ - return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); -} - -static inline void timer_set_deferrable(struct timer_list *timer) -{ - timer->base = TBASE_MAKE_DEFERRED(timer->base); -} - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) -{ - timer->base = (struct tvec_base *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); -} - -static unsigned long round_jiffies_common(unsigned long j, int cpu, - bool force_up) -{ - int rem; - unsigned long original = j; - - /* - * We don't want all cpus firing their timers at once hitting the - * same lock or cachelines, so we skew each extra cpu with an extra - * 3 jiffies. This 3 jiffies came originally from the mm/ code which - * already did this. - * The skew is done by adding 3*cpunr, then round, then subtract this - * extra offset again. 
- */ - j += cpu * 3; - - rem = j % HZ; - - /* - * If the target jiffie is just after a whole second (which can happen - * due to delays of the timer irq, long irq off times etc etc) then - * we should round down to the whole second, not up. Use 1/4th second - * as cutoff for this rounding as an extreme upper bound for this. - * But never round down if @force_up is set. - */ - if (rem < HZ/4 && !force_up) /* round down */ - j = j - rem; - else /* round up */ - j = j - rem + HZ; - - /* now that we have rounded, subtract the extra skew again */ - j -= cpu * 3; - - if (j <= jiffies) /* rounding ate our timeout entirely; */ - return original; - return j; -} - -/** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. 
- */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** - * __round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies_relative(unsigned long j, int cpu) -{ - unsigned long j0 = jiffies; - - /* Use j0 because jiffies might change while we run */ - return round_jiffies_common(j + j0, cpu, false) - j0; -} -EXPORT_SYMBOL_GPL(__round_jiffies_relative); - -/** - * round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. 
- * - * The return value is the rounded version of the @j parameter. - */ -unsigned long round_jiffies(unsigned long j) -{ - return round_jiffies_common(j, raw_smp_processor_id(), false); -} -EXPORT_SYMBOL_GPL(round_jiffies); - -/** - * round_jiffies_relative - function to round jiffies to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * round_jiffies_relative() rounds a time delta in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long round_jiffies_relative(unsigned long j) -{ - return __round_jiffies_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_relative); - -/** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** - * __round_jiffies_up_relative - function to round jiffies up to a full second - * @j: the time in (relative) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies_relative() except that it will never - * round down. 
This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) -{ - unsigned long j0 = jiffies; - - /* Use j0 because jiffies might change while we run */ - return round_jiffies_common(j + j0, cpu, true) - j0; -} -EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); - -/** - * round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * - * This is the same as round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long round_jiffies_up(unsigned long j) -{ - return round_jiffies_common(j, raw_smp_processor_id(), true); -} -EXPORT_SYMBOL_GPL(round_jiffies_up); - -/** - * round_jiffies_up_relative - function to round jiffies up to a full second - * @j: the time in (relative) jiffies that should be rounded - * - * This is the same as round_jiffies_relative() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long round_jiffies_up_relative(unsigned long j) -{ - return __round_jiffies_up_relative(j, raw_smp_processor_id()); -} -EXPORT_SYMBOL_GPL(round_jiffies_up_relative); - -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. 
- */ -void set_timer_slack(struct timer_list *timer, int slack_hz) -{ - timer->slack = slack_hz; -} -EXPORT_SYMBOL_GPL(set_timer_slack); - -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) -{ - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); - } else { - int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: - */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; - expires = idx + base->timer_jiffies; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; - } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); -} - -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - unsigned int flag = 0; - - if (likely(!timer->start_site)) - return; - if (unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; - - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - 
timer->function, timer->start_comm, flag); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS - -static struct debug_obj_descr timer_debug_descr; - -static void *timer_debug_hint(void *addr) -{ - return ((struct timer_list *) addr)->function; -} - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_init(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -/* Stub timer callback for improperly used timers. */ -static void stub_timer(unsigned long data) -{ - WARN_ON(1); -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); - debug_object_free(timer, &timer_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_assert_init is called when: - * - an untracked/uninit-ed object is found - */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) -{ - struct timer_list *timer = addr; - - switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - default: - return 0; - } -} - -static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .debug_hint = timer_debug_hint, - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, - .fixup_assert_init = timer_fixup_assert_init, -}; - -static inline void debug_timer_init(struct timer_list *timer) -{ - debug_object_init(timer, &timer_debug_descr); -} - -static inline void debug_timer_activate(struct timer_list *timer) -{ - debug_object_activate(timer, &timer_debug_descr); -} - -static inline void debug_timer_deactivate(struct timer_list *timer) -{ - debug_object_deactivate(timer, &timer_debug_descr); -} - -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - -static inline void debug_timer_assert_init(struct timer_list *timer) -{ - debug_object_assert_init(timer, &timer_debug_descr); -} - -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key); - -void init_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - debug_object_init_on_stack(timer, &timer_debug_descr); - __init_timer(timer, name, key); -} -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); - -void destroy_timer_on_stack(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); - -#else -static inline void debug_timer_init(struct timer_list *timer) { } -static inline void debug_timer_activate(struct timer_list *timer) { } -static inline void debug_timer_deactivate(struct timer_list *timer) { } -static inline void debug_timer_assert_init(struct timer_list *timer) { } -#endif - -static inline void debug_init(struct timer_list *timer) -{ - debug_timer_init(timer); - trace_timer_init(timer); -} - 
-static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires); -} - -static inline void debug_deactivate(struct timer_list *timer) -{ - debug_timer_deactivate(timer); - trace_timer_cancel(timer); -} - -static inline void debug_assert_init(struct timer_list *timer) -{ - debug_timer_assert_init(timer); -} - -static void __init_timer(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - timer->entry.next = NULL; - timer->base = __raw_get_cpu_var(tvec_bases); - timer->slack = -1; -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif - lockdep_init_map(&timer->lockdep_map, name, key, 0); -} - -void setup_deferrable_timer_on_stack_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key, - void (*function)(unsigned long), - unsigned long data) -{ - timer->function = function; - timer->data = data; - init_timer_on_stack_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); - -/** - * init_timer_key - initialize a timer - * @timer: the timer to be initialized - * @name: name of the timer - * @key: lockdep class key of the fake lock used for tracking timer - * sync lock dependencies - * - * init_timer_key() must be done to a timer prior calling *any* of the - * other timer functions. 
- */ -void init_timer_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - debug_init(timer); - __init_timer(timer, name, key); -} -EXPORT_SYMBOL(init_timer_key); - -void init_timer_deferrable_key(struct timer_list *timer, - const char *name, - struct lock_class_key *key) -{ - init_timer_key(timer, name, key); - timer_set_deferrable(timer); -} -EXPORT_SYMBOL(init_timer_deferrable_key); - -static inline void detach_timer(struct timer_list *timer, - int clear_pending) -{ - struct list_head *entry = &timer->entry; - - debug_deactivate(timer); - - __list_del(entry->prev, entry->next); - if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; -} - -/* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. - * - * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. - * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. 
- */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) - __acquires(timer->base->lock) -{ - struct tvec_base *base; - - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) - return base; - /* The timer has migrated to another CPU */ - spin_unlock_irqrestore(&base->lock, *flags); - } - cpu_relax(); - } -} - -static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) -{ - struct tvec_base *base, *new_base; - unsigned long flags; - int ret = 0 , cpu; - - timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); - - if (timer_pending(timer)) { - detach_timer(timer, 0); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } else { - if (pending_only) - goto out_unlock; - } - - debug_activate(timer, expires); - - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif - new_base = per_cpu(tvec_bases, cpu); - - if (base != new_base) { - /* - * We are trying to schedule the timer on the local CPU. - * However we can't change timer's base while it is running, - * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. 
- */ - if (likely(base->running_timer != timer)) { - /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); - spin_unlock(&base->lock); - base = new_base; - spin_lock(&base->lock); - timer_set_base(timer, base); - } - } - - timer->expires = expires; - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; - internal_add_timer(base, timer); - -out_unlock: - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} - -/** - * mod_timer_pending - modify a pending timer's timeout - * @timer: the pending timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pending() is the same for pending timers as mod_timer(), - * but will not re-activate and modify already deleted timers. - * - * It is useful for unserialized use of timers. - */ -int mod_timer_pending(struct timer_list *timer, unsigned long expires) -{ - return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); -} -EXPORT_SYMBOL(mod_timer_pending); - -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - * 1) calculate the maximum (absolute) time - * 2) calculate the highest bit where the expires and new max are different - * 3) use this bit to make a mask - * 4) use the bitmask to round down the maximum time, so that all last - * bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ - unsigned long expires_limit, mask; - int bit; - - if (timer->slack >= 0) { - expires_limit = expires + timer->slack; - } else { - long delta = expires - jiffies; - - if (delta < 256) - return expires; - - expires_limit = expires + delta / 256; - } - mask = expires ^ expires_limit; - if (mask == 0) - return expires; - - bit = find_last_bit(&mask, BITS_PER_LONG); - - mask = (1 << bit) - 1; - - expires_limit = expires_limit & ~(mask); - - return expires_limit; -} - -/** - * mod_timer - modify a timer's timeout - * 
@timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * - * mod_timer(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - * - * Note that if there are multiple unserialized concurrent users of the - * same timer, then mod_timer() is the only safe way to modify the timeout, - * since add_timer() cannot modify an already running timer. - * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) - */ -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - expires = apply_slack(timer, expires); - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer_pending(timer) && timer->expires == expires) - return 1; - - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); -} -EXPORT_SYMBOL(mod_timer); - -/** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and not allow the timer to be migrated to a different CPU. 
- * - * mod_timer_pinned(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires, false, TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - -/** - * add_timer - start a timer - * @timer: the timer to be added - * - * The kernel will do a ->function(->data) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. - * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. - * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. - */ -void add_timer(struct timer_list *timer) -{ - BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); -} -EXPORT_SYMBOL(add_timer); - -/** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on - * - * This is not very scalable on SMP. Double adds are not possible. - */ -void add_timer_on(struct timer_list *timer, int cpu) -{ - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_activate(timer, timer->expires); - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. 
This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. - */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); -} -EXPORT_SYMBOL_GPL(add_timer_on); - -/** - * del_timer - deactive a timer. - * @timer: the timer to be deactivated - * - * del_timer() deactivates a timer - this works on both active and inactive - * timers. - * - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) - */ -int del_timer(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = 0; - - debug_assert_init(timer); - - timer_stats_timer_clear_start_info(timer); - if (timer_pending(timer)) { - base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } - spin_unlock_irqrestore(&base->lock, flags); - } - - return ret; -} -EXPORT_SYMBOL(del_timer); - -/** - * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del - * - * This function tries to deactivate a timer. Upon successful (ret >= 0) - * exit the timer is not queued and the handler is not running on any CPU. 
- */ -int try_to_del_timer_sync(struct timer_list *timer) -{ - struct tvec_base *base; - unsigned long flags; - int ret = -1; - - debug_assert_init(timer); - - base = lock_timer_base(timer, &flags); - - if (base->running_timer == timer) - goto out; - - timer_stats_timer_clear_start_info(timer); - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } -out: - spin_unlock_irqrestore(&base->lock, flags); - - return ret; -} -EXPORT_SYMBOL(try_to_del_timer_sync); - -#ifdef CONFIG_SMP -/** - * del_timer_sync - deactivate a timer and wait for the handler to finish. - * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. - * - * Synchronization rules: Callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. The timer's handler must not call - * add_timer_on(). Upon exit the timer is not queued and the handler is - * not running on any CPU. - * - * Note: You must not hold locks that are held in interrupt context - * while calling this function. Even if the lock has nothing to do - * with the timer in question. Here's why: - * - * CPU0 CPU1 - * ---- ---- - * - * call_timer_fn(); - * base->running_timer = mytimer; - * spin_lock_irq(somelock); - * - * spin_lock(somelock); - * del_timer_sync(mytimer); - * while (base->running_timer == mytimer); - * - * Now del_timer_sync() will never return and never release somelock. - * The interrupt on the other CPU is waiting to grab somelock but - * it has interrupted the softirq that CPU0 is waiting to finish. 
- * - * The function returns whether it has deactivated a pending timer or not. - */ -int del_timer_sync(struct timer_list *timer) -{ -#ifdef CONFIG_LOCKDEP - unsigned long flags; - - /* - * If lockdep gives a backtrace here, please reference - * the synchronization rules above. - */ - local_irq_save(flags); - lock_map_acquire(&timer->lockdep_map); - lock_map_release(&timer->lockdep_map); - local_irq_restore(flags); -#endif - /* - * don't use it in hardirq context, because it - * could lead to deadlock. - */ - WARN_ON(in_irq()); - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } -} -EXPORT_SYMBOL(del_timer_sync); -#endif - -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; - - list_replace_init(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. - */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); - } - - return index; -} - -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), - unsigned long data) -{ - int preempt_count = preempt_count(); - -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the timer from inside the - * function that is called from it, this we need to take into - * account for lockdep too. To avoid bogus "held lock freed" - * warnings as well as problems when looking into - * timer->lockdep_map, make a copy and use that here. - */ - struct lockdep_map lockdep_map = timer->lockdep_map; -#endif - /* - * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map around the fn() - * call here and in del_timer_sync(). 
- */ - lock_map_acquire(&lockdep_map); - - trace_timer_expire_entry(timer); - fn(data); - trace_timer_expire_exit(timer); - - lock_map_release(&lockdep_map); - - if (preempt_count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); - /* - * Restore the preempt count. That gives us a decent - * chance to survive and extract information. If the - * callback kept a lock held, bad luck, but not worse - * than the BUG() we had. - */ - preempt_count() = preempt_count; - } -} - -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. - */ -static inline void __run_timers(struct tvec_base *base) -{ - struct timer_list *timer; - - spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; - - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); - while (!list_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - - timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; - - timer_stats_account_timer(timer); - - base->running_timer = timer; - detach_timer(timer, 1); - - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); - } - } - base->running_timer = NULL; - spin_unlock_irq(&base->lock); -} - -#ifdef CONFIG_NO_HZ -/* - * Find out when the next timer event is due to happen. 
This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. - */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; - do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; - } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = timer_jiffies & TVN_MASK; - do { - list_for_each_entry(nte, varp->vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; - - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? 
*/ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; - } - return expires; -} - -/* - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) -{ - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; - - /* - * Expired timer available, let it expire in the next tick - */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); - - /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): - */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; - - /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq - */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; -} - -/** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) - */ -unsigned long get_next_timer_interrupt(unsigned long now) -{ - struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires; - - /* - * Pretend that there is no timer pending if the cpu is offline. - * Possible pending timers will be migrated later to an active cpu. 
- */ - if (cpu_is_offline(smp_processor_id())) - return now + NEXT_TIMER_MAX_DELTA; - spin_lock(&base->lock); - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; - spin_unlock(&base->lock); - - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); -} -#endif - -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - int cpu = smp_processor_id(); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - rcu_check_callbacks(cpu, user_tick); - printk_tick(); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_run(); -#endif - scheduler_tick(); - run_posix_cpu_timers(p); -} - -/* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - struct tvec_base *base = __this_cpu_read(tvec_bases); - - hrtimer_run_pending(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* - * Called by the local, per-CPU timer interrupt on SMP. - */ -void run_local_timers(void) -{ - hrtimer_run_queues(); - raise_softirq(TIMER_SOFTIRQ); -} - -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - -/** - * sys_getpid - return the thread group id of the current process - * - * Note, despite the name, this returns the tgid not the pid. 
The tgid and - * the pid are identical unless CLONE_THREAD was specified on clone() in - * which case the tgid is the same in all threads of the same group. - * - * This is SMP safe as current->tgid does not change. - */ -SYSCALL_DEFINE0(getpid) -{ - return task_tgid_vnr(current); -} - -/* - * Accessing ->real_parent is not SMP-safe, it could - * change from under us. However, we can use a stale - * value of ->real_parent under rcu_read_lock(), see - * release_task()->call_rcu(delayed_put_task_struct). - */ -SYSCALL_DEFINE0(getppid) -{ - int pid; - - rcu_read_lock(); - pid = task_tgid_vnr(rcu_dereference(current->real_parent)); - rcu_read_unlock(); - - return pid; -} - -SYSCALL_DEFINE0(getuid) -{ - /* Only we change this so SMP safe */ - return current_uid(); -} - -SYSCALL_DEFINE0(geteuid) -{ - /* Only we change this so SMP safe */ - return current_euid(); -} - -SYSCALL_DEFINE0(getgid) -{ - /* Only we change this so SMP safe */ - return current_gid(); -} - -SYSCALL_DEFINE0(getegid) -{ - /* Only we change this so SMP safe */ - return current_egid(); -} - -#endif - -static void process_timeout(unsigned long __data) -{ - wake_up_process((struct task_struct *)__data); -} - -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. 
- * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * In all cases the return value is guaranteed to be non-negative. - */ -signed long __sched schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx\n", timeout); - dump_stack(); - current->state = TASK_RUNNING; - goto out; - } - } - - expire = timeout + jiffies; - - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); - schedule(); - del_singleshot_timer_sync(&timer); - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 0 : timeout; -} -EXPORT_SYMBOL(schedule_timeout); - -/* - * We can use __set_current_state() here because schedule_timeout() calls - * schedule() unconditionally. 
- */ -signed long __sched schedule_timeout_interruptible(signed long timeout) -{ - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_interruptible); - -signed long __sched schedule_timeout_killable(signed long timeout) -{ - __set_current_state(TASK_KILLABLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_killable); - -signed long __sched schedule_timeout_uninterruptible(signed long timeout) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); -} -EXPORT_SYMBOL(schedule_timeout_uninterruptible); - -/* Thread ID - the internal kernel "pid" */ -SYSCALL_DEFINE0(gettid) -{ - return task_pid_vnr(current); -} - -/** - * do_sysinfo - fill in sysinfo struct - * @info: pointer to buffer to fill - */ -int do_sysinfo(struct sysinfo *info) -{ - unsigned long mem_total, sav_total; - unsigned int mem_unit, bitcount; - struct timespec tp; - - memset(info, 0, sizeof(struct sysinfo)); - - ktime_get_ts(&tp); - monotonic_to_bootbased(&tp); - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - - si_meminfo(info); - si_swapinfo(info); - - /* - * If the sum of all the available memory (i.e. ram + swap) - * is less than can be stored in a 32 bit unsigned long then - * we can be binary compatible with 2.2.x kernels. If not, - * well, in that case 2.2.x was broken anyways... - * - * -Erik Andersen - */ - - mem_total = info->totalram + info->totalswap; - if (mem_total < info->totalram || mem_total < info->totalswap) - goto out; - bitcount = 0; - mem_unit = info->mem_unit; - while (mem_unit > 1) { - bitcount++; - mem_unit >>= 1; - sav_total = mem_total; - mem_total <<= 1; - if (mem_total < sav_total) - goto out; - } - - /* - * If mem_total did not overflow, multiply all memory values by - * info->mem_unit and set it to 1. 
This leaves things compatible - * with 2.2.x, and also retains compatibility with earlier 2.4.x - * kernels... - */ - - info->mem_unit = 1; - info->totalram <<= bitcount; - info->freeram <<= bitcount; - info->sharedram <<= bitcount; - info->bufferram <<= bitcount; - info->totalswap <<= bitcount; - info->freeswap <<= bitcount; - info->totalhigh <<= bitcount; - info->freehigh <<= bitcount; - -out: - return 0; -} - -SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) -{ - struct sysinfo val; - - do_sysinfo(&val); - - if (copy_to_user(info, &val, sizeof(struct sysinfo))) - return -EFAULT; - - return 0; -} - -static int __cpuinit init_timers_cpu(int cpu) -{ - int j; - struct tvec_base *base; - static char __cpuinitdata tvec_base_done[NR_CPUS]; - - if (!tvec_base_done[cpu]) { - static char boot_done; - - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; - } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the memory allocators are not - * initialised either. 
- */ - boot_done = 1; - base = &boot_tvec_bases; - } - tvec_base_done[cpu] = 1; - } else { - base = per_cpu(tvec_bases, cpu); - } - - spin_lock_init(&base->lock); - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) -{ - struct timer_list *timer; - - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); - timer_set_base(timer, new_base); - if (time_before(timer->expires, new_base->next_timer) && - !tbase_get_deferrable(timer->base)) - new_base->next_timer = timer->expires; - internal_add_timer(new_base, timer); - } -} - -static void __cpuinit migrate_timers(int cpu) -{ - struct tvec_base *old_base; - struct tvec_base *new_base; - int i; - - BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. 
- */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __cpuinit timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - int err; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = init_timers_cpu(cpu); - if (err < 0) - return notifier_from_errno(err); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers(cpu); - break; -#endif - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata timers_nb = { - .notifier_call = timer_cpu_notify, -}; - - -void __init init_timers(void) -{ - int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - - init_timer_stats(); - - BUG_ON(err != NOTIFY_OK); - register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq); -} - -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -void msleep(unsigned int msecs) -{ - unsigned long timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -} - -EXPORT_SYMBOL(msleep); - -/** - * msleep_interruptible - sleep waiting for signals - * @msecs: Time in milliseconds to sleep for - */ -unsigned long msleep_interruptible(unsigned int msecs) -{ - unsigned long timeout = 
msecs_to_jiffies(msecs) + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); - return jiffies_to_msecs(timeout); -} - -EXPORT_SYMBOL(msleep_interruptible); - -static int __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - unsigned long delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - -/** - * usleep_range - Drop in replacement for udelay where wakeup is flexible - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep - */ -void usleep_range(unsigned long min, unsigned long max) -{ - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); -} -EXPORT_SYMBOL(usleep_range); -/* - * Copyright (C) 2006 Jens Axboe - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" - -#ifdef CONFIG_BLK_DEV_IO_TRACE - -static unsigned int blktrace_seq __read_mostly = 1; - -static struct trace_array *blk_tr; -static bool blk_tracer_enabled __read_mostly; - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 - -static struct tracer_opt blk_tracer_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, - { } -}; - -static struct tracer_flags blk_tracer_flags = { - .val = 0, - .opts = blk_tracer_opts, -}; - -/* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); - -static void blk_register_tracepoints(void); -static void blk_unregister_tracepoints(void); - -/* - * Send out a notify message. 
- */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) -{ - struct blk_io_trace *t; - struct ring_buffer_event *event = NULL; - struct ring_buffer *buffer = NULL; - int pc = 0; - int cpu = smp_processor_id(); - bool blk_tracer = blk_tracer_enabled; - - if (blk_tracer) { - buffer = blk_tr->buffer; - pc = preempt_count(); - event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, - 0, pc); - if (!event) - return; - t = ring_buffer_event_data(event); - goto record_it; - } - - if (!bt->rchan) - return; - - t = relay_reserve(bt->rchan, sizeof(*t) + len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action; - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); - - if (blk_tracer) - trace_buffer_unlock_commit(buffer, event, 0, pc); - } -} - -/* - * Send out a notify for this process, if we haven't done so since a trace - * started - */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) -{ - tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); -} - -static void trace_note_time(struct blk_trace *bt) -{ - struct timespec now; - unsigned long flags; - u32 words[2]; - - getnstimeofday(&now); - words[0] = now.tv_sec; - words[1] = now.tv_nsec; - - local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); - local_irq_restore(flags); -} - -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) -{ - int n; - va_list args; - unsigned long flags; - char *buf; - - if (unlikely(bt->trace_state != Blktrace_running && - !blk_tracer_enabled)) - return; - - /* - * If the BLK_TC_NOTIFY action mask isn't set, don't send any note - * message to the trace. 
- */ - if (!(bt->act_mask & BLK_TC_NOTIFY)) - return; - - local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); - va_start(args, fmt); - n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); - va_end(args); - - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(__trace_note_message); - -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, - pid_t pid) -{ - if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) - return 1; - if (sector && (sector < bt->start_lba || sector > bt->end_lba)) - return 1; - if (bt->pid && pid != bt->pid) - return 1; - - return 0; -} - -/* - * Data direction bit lookup - */ -static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; - -#define BLK_TC_RAHEAD BLK_TC_AHEAD - -/* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) - -/* - * The worker for the various blk_add_trace*() types. Fills out a - * blk_io_trace structure and places it in a per-cpu subbuffer. 
- */ -static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) -{ - struct task_struct *tsk = current; - struct ring_buffer_event *event = NULL; - struct ring_buffer *buffer = NULL; - struct blk_io_trace *t; - unsigned long flags = 0; - unsigned long *sequence; - pid_t pid; - int cpu, pc = 0; - bool blk_tracer = blk_tracer_enabled; - - if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) - return; - - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, RAHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - what |= MASK_TC_BIT(rw, FLUSH); - what |= MASK_TC_BIT(rw, FUA); - - pid = tsk->pid; - if (act_log_check(bt, what, sector, pid)) - return; - cpu = raw_smp_processor_id(); - - if (blk_tracer) { - tracing_record_cmdline(current); - - buffer = blk_tr->buffer; - pc = preempt_count(); - event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, - 0, pc); - if (!event) - return; - t = ring_buffer_event_data(event); - goto record_it; - } - - /* - * A word about the locking here - we disable interrupts to reserve - * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. - */ - local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. 
- */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len; - - if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(buffer, event, 0, pc); - return; - } - } - - local_irq_restore(flags); -} - -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - -static void blk_trace_free(struct blk_trace *bt) -{ - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); - relay_close(bt->rchan); - debugfs_remove(bt->dir); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - kfree(bt); -} - -static void blk_trace_cleanup(struct blk_trace *bt) -{ - blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); -} - -int blk_trace_remove(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (!bt) - return -EINVAL; - - if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_remove); - -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct blk_trace *bt = filp->private_data; - char buf[16]; - - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); - - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - -static const struct file_operations blk_dropped_fops = { - .owner = THIS_MODULE, - .open = blk_dropped_open, - .read = blk_dropped_read, - .llseek = default_llseek, -}; - -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *msg; - struct 
blk_trace *bt; - - if (count >= BLK_TN_MAX_MSG) - return -EINVAL; - - msg = kmalloc(count + 1, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } - - msg[count] = '\0'; - bt = filp->private_data; - __trace_note_message(bt, "%s", msg); - kfree(msg); - - return count; -} - -static const struct file_operations blk_msg_fops = { - .owner = THIS_MODULE, - .open = blk_msg_open, - .write = blk_msg_write, - .llseek = noop_llseek, -}; - -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - -static int blk_remove_buf_file_callback(struct dentry *dentry) -{ - debugfs_remove(dentry); - - return 0; -} - -static struct dentry *blk_create_buf_file_callback(const char *filename, - struct dentry *parent, - umode_t mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, - .create_buf_file = blk_create_buf_file_callback, - .remove_buf_file = blk_remove_buf_file_callback, -}; - -static void blk_trace_setup_lba(struct blk_trace *bt, - struct block_device *bdev) -{ - struct hd_struct *part = NULL; - - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; - } else { - bt->start_lba = 0; - bt->end_lba = -1ULL; - } -} - -/* - * Setup everything required to start tracing - */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct 
blk_user_trace_setup *buts) -{ - struct blk_trace *old_bt, *bt = NULL; - struct dentry *dir = NULL; - int ret, i; - - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; - - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - return -ENOMEM; - - ret = -ENOMEM; - bt->sequence = alloc_percpu(unsigned long); - if (!bt->sequence) - goto err; - - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); - if (!bt->msg_data) - goto err; - - ret = -ENOENT; - - mutex_lock(&blk_tree_mutex); - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) { - mutex_unlock(&blk_tree_mutex); - goto err; - } - } - mutex_unlock(&blk_tree_mutex); - - dir = debugfs_create_dir(buts->name, blk_tree_root); - - if (!dir) - goto err; - - bt->dir = dir; - bt->dev = dev; - atomic_set(&bt->dropped, 0); - - ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, - &blk_dropped_fops); - if (!bt->dropped_file) - goto err; - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; - - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); - if (!bt->rchan) - goto err; - - bt->act_mask = buts->act_mask; - if (!bt->act_mask) - bt->act_mask = (u16) -1; - - blk_trace_setup_lba(bt, bdev); - - /* overwrite with user settings */ - if (buts->start_lba) - bt->start_lba = buts->start_lba; - if (buts->end_lba) - bt->end_lba = buts->end_lba; - - bt->pid = buts->pid; - bt->trace_state = Blktrace_setup; - - ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); - goto err; - } - - if 
(atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); - - return 0; -err: - blk_trace_free(bt); - return ret; -} - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) -{ - struct blk_user_trace_setup buts; - int ret; - - ret = copy_from_user(&buts, arg, sizeof(buts)); - if (ret) - return -EFAULT; - - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); - if (ret) - return ret; - - if (copy_to_user(arg, &buts, sizeof(buts))) { - blk_trace_remove(q); - return -EFAULT; - } - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_setup); - -#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) -static int compat_blk_trace_setup(struct request_queue *q, char *name, - dev_t dev, struct block_device *bdev, - char __user *arg) -{ - struct blk_user_trace_setup buts; - struct compat_blk_user_trace_setup cbuts; - int ret; - - if (copy_from_user(&cbuts, arg, sizeof(cbuts))) - return -EFAULT; - - buts = (struct blk_user_trace_setup) { - .act_mask = cbuts.act_mask, - .buf_size = cbuts.buf_size, - .buf_nr = cbuts.buf_nr, - .start_lba = cbuts.start_lba, - .end_lba = cbuts.end_lba, - .pid = cbuts.pid, - }; - memcpy(&buts.name, &cbuts.name, 32); - - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); - if (ret) - return ret; - - if (copy_to_user(arg, &buts.name, 32)) { - blk_trace_remove(q); - return -EFAULT; - } - - return 0; -} -#endif - -int blk_trace_startstop(struct request_queue *q, int start) -{ - int ret; - struct blk_trace *bt = q->blk_trace; - - if (bt == NULL) - return -EINVAL; - - /* - * For starting a trace, we can transition from a setup or stopped - * trace. 
For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(blk_trace_startstop); - -/** - * blk_trace_ioctl: - handle the ioctls associated with tracing - * @bdev: the block device - * @cmd: the ioctl cmd - * @arg: the argument data, if any - * - **/ -int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) -{ - struct request_queue *q; - int ret, start = 0; - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - mutex_lock(&bdev->bd_mutex); - - switch (cmd) { - case BLKTRACESETUP: - bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); - break; -#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) - case BLKTRACESETUP32: - bdevname(bdev, b); - ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); - break; -#endif - case BLKTRACESTART: - start = 1; - case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); - break; - case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); - break; - default: - ret = -ENOTTY; - break; - } - - mutex_unlock(&bdev->bd_mutex); - return ret; -} - -/** - * blk_trace_shutdown: - stop and cleanup trace structures - * @q: the request queue associated with the device - * - **/ -void blk_trace_shutdown(struct request_queue *q) -{ - if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); - } -} - -/* - * blktrace probes - */ - -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. 
- * - **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); - } -} - -static void blk_add_trace_rq_abort(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ABORT); -} - -static void blk_add_trace_rq_insert(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_INSERT); -} - -static void blk_add_trace_rq_issue(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); -} - -static void blk_add_trace_rq_requeue(void *ignore, - struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); -} - -static void blk_add_trace_rq_complete(void *ignore, - struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * @error: error, if any - * - * Description: - * Records an action against a bio. Will log the bio offset + size. 
- * - **/ -static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (!error && !bio_flagged(bio, BIO_UPTODATE)) - error = EIO; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - error, 0, NULL); -} - -static void blk_add_trace_bio_bounce(void *ignore, - struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); -} - -static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) -{ - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); -} - -static void blk_add_trace_bio_backmerge(void *ignore, - struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); -} - -static void blk_add_trace_bio_frontmerge(void *ignore, - struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); -} - -static void blk_add_trace_bio_queue(void *ignore, - struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); -} - -static void blk_add_trace_getrq(void *ignore, - struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); - } -} - - -static void blk_add_trace_sleeprq(void *ignore, - struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, - 0, 0, NULL); - } -} - -static void blk_add_trace_plug(void *ignore, struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); -} - -static void blk_add_trace_unplug(void *ignore, struct request_queue *q, - unsigned int 
depth, bool explicit) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - __be64 rpdu = cpu_to_be64(depth); - u32 what; - - if (explicit) - what = BLK_TA_UNPLUG_IO; - else - what = BLK_TA_UNPLUG_TIMER; - - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_split(void *ignore, - struct request_queue *q, struct bio *bio, - unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); - } -} - -/** - * blk_add_trace_bio_remap - Add a trace for a bio-remap operation - * @ignore: trace callback data parameter (not used) - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * - **/ -static void blk_add_trace_bio_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector_from = cpu_to_be64(from); - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), - sizeof(r), &r); -} - -/** - * blk_add_trace_rq_remap - Add a trace for a request-remap operation - * @ignore: trace callback data parameter (not used) - * @q: queue the io is for - * @rq: the source request - * @dev: target device - * @from: source sector - * - * Description: - * Device mapper remaps request to other devices. - * Add a trace for that action. 
- * - **/ -static void blk_add_trace_rq_remap(void *ignore, - struct request_queue *q, - struct request *rq, dev_t dev, - sector_t from) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); - r.sector_from = cpu_to_be64(from); - - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, - sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, - BLK_TA_DRV_DATA, rq->errors, len, data); - else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, - BLK_TA_DRV_DATA, rq->errors, len, data); -} -EXPORT_SYMBOL_GPL(blk_add_driver_data); - -static void blk_register_tracepoints(void) -{ - int ret; - - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); - WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); - WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); - WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); - WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); - WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - WARN_ON(ret); - ret = 
register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); - WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); - WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); - WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); - WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); - WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug, NULL); - WARN_ON(ret); - ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); - WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split, NULL); - WARN_ON(ret); - ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); - WARN_ON(ret); - ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); - WARN_ON(ret); -} - -static void blk_unregister_tracepoints(void) -{ - unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); - unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); - unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug(blk_add_trace_unplug, NULL); - unregister_trace_block_plug(blk_add_trace_plug, NULL); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); - unregister_trace_block_getrq(blk_add_trace_getrq, NULL); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); - 
unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); - - tracepoint_synchronize_unregister(); -} - -/* - * struct blk_io_tracer formatting routines - */ - -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) -{ - int i = 0; - int tc = t->action >> BLK_TC_SHIFT; - - if (t->action == BLK_TN_MESSAGE) { - rwbs[i++] = 'N'; - goto out; - } - - if (tc & BLK_TC_FLUSH) - rwbs[i++] = 'F'; - - if (tc & BLK_TC_DISCARD) - rwbs[i++] = 'D'; - else if (tc & BLK_TC_WRITE) - rwbs[i++] = 'W'; - else if (t->bytes) - rwbs[i++] = 'R'; - else - rwbs[i++] = 'N'; - - if (tc & BLK_TC_FUA) - rwbs[i++] = 'F'; - if (tc & BLK_TC_AHEAD) - rwbs[i++] = 'A'; - if (tc & BLK_TC_SYNC) - rwbs[i++] = 'S'; - if (tc & BLK_TC_META) - rwbs[i++] = 'M'; -out: - rwbs[i] = '\0'; -} - -static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) -{ - return (const struct blk_io_trace *)ent; -} - -static inline const void *pdu_start(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent) + 1; -} - -static inline u32 t_action(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->action; -} - -static inline u32 t_bytes(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->bytes; -} - -static inline u32 t_sec(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->bytes >> 9; -} - -static inline unsigned long long t_sector(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static inline __u16 t_error(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->error; -} - -static __u64 get_pdu_int(const struct trace_entry *ent) -{ - const __u64 *val = pdu_start(ent); - return be64_to_cpu(*val); -} - -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector_from = __r->sector_from; - - r->device_from = be32_to_cpu(__r->device_from); - r->device_to = be32_to_cpu(__r->device_to); - 
r->sector_from = be64_to_cpu(sector_from); -} - -typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); - -static int blk_log_action_classic(struct trace_iterator *iter, const char *act) -{ - char rwbs[RWBS_LEN]; - unsigned long long ts = iter->ts; - unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); - unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); - - fill_rwbs(rwbs, t); - - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, nsec_rem, iter->ent->pid, act, rwbs); -} - -static int blk_log_action(struct trace_iterator *iter, const char *act) -{ - char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); - - fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) -{ - const unsigned char *pdu_buf; - int pdu_len; - int i, end, ret; - - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; - - if (!pdu_len) - return 1; - - /* find the last zero that needs to be printed */ - for (end = pdu_len - 1; end >= 0; end--) - if (pdu_buf[end]) - break; - end++; - - if (!trace_seq_putc(s, '(')) - return 0; - - for (i = 0; i < pdu_len; i++) { - - ret = trace_seq_printf(s, "%s%02x", - i == 0 ? "" : " ", pdu_buf[i]); - if (!ret) - return ret; - - /* - * stop when the rest is just zeroes and indicate so - * with a ".." appended - */ - if (i == end && end != pdu_len - 1) - return trace_seq_puts(s, " ..) 
"); - } - - return trace_seq_puts(s, ") "); -} - -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) -{ - char cmd[TASK_COMM_LEN]; - - trace_find_cmdline(ent->pid, cmd); - - if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = trace_seq_printf(s, "%u ", t_bytes(ent)); - if (!ret) - return 0; - ret = blk_log_dump_pdu(s, ent); - if (!ret) - return 0; - return trace_seq_printf(s, "[%s]\n", cmd); - } else { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); - } -} - -static int blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) -{ - if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = blk_log_dump_pdu(s, ent); - if (ret) - return trace_seq_printf(s, "[%d]\n", t_error(ent)); - return 0; - } else { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", - t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", - t_sector(ent), t_error(ent)); - } -} - -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) -{ - struct blk_io_trace_remap r = { .device_from = 0, }; - - get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), t_sec(ent), - MAJOR(r.device_from), MINOR(r.device_from), - (unsigned long long)r.sector_from); -} - -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) -{ - char cmd[TASK_COMM_LEN]; - - trace_find_cmdline(ent->pid, cmd); - - return trace_seq_printf(s, "[%s]\n", cmd); -} - -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) -{ - char cmd[TASK_COMM_LEN]; - - trace_find_cmdline(ent->pid, cmd); - - return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); -} - -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) -{ - char cmd[TASK_COMM_LEN]; - - trace_find_cmdline(ent->pid, cmd); - - return 
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); -} - -static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) -{ - int ret; - const struct blk_io_trace *t = te_blk_io_trace(ent); - - ret = trace_seq_putmem(s, t + 1, t->pdu_len); - if (ret) - return trace_seq_putc(s, '\n'); - return ret; -} - -/* - * struct tracer operations - */ - -static void blk_tracer_print_header(struct seq_file *m) -{ - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return; - seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" - "# | | | | | |\n"); -} - -static void blk_tracer_start(struct trace_array *tr) -{ - blk_tracer_enabled = true; -} - -static int blk_tracer_init(struct trace_array *tr) -{ - blk_tr = tr; - blk_tracer_start(tr); - return 0; -} - -static void blk_tracer_stop(struct trace_array *tr) -{ - blk_tracer_enabled = false; -} - -static void blk_tracer_reset(struct trace_array *tr) -{ - blk_tracer_stop(tr); -} - -static const struct { - const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, - [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, - [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, - [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, - [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, - [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, - [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, - [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, - [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, - [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, - [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, - [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, - [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, - [__BLK_TA_REMAP] 
= {{ "A", "remap" }, blk_log_remap }, -}; - -static enum print_line_t print_one_line(struct trace_iterator *iter, - bool classic) -{ - struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; - u16 what; - int ret; - bool long_act; - blk_log_action_t *log_action; - - t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - log_action = classic ? &blk_log_action_classic : &blk_log_action; - - if (t->action == BLK_TN_MESSAGE) { - ret = log_action(iter, long_act ? "message" : "m"); - if (ret) - ret = blk_log_msg(s, iter->ent); - goto out; - } - - if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Unknown action %x\n", what); - else { - ret = log_action(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); - } -out: - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - return print_one_line(iter, false); -} - -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); - struct blk_io_trace old = { - .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = iter->ts, - }; - - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); -} - -static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return blk_trace_synthesize_old_trace(iter) ? 
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) -{ - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return TRACE_TYPE_UNHANDLED; - - return print_one_line(iter, true); -} - -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) -{ - /* don't output context-info for blk_classic output */ - if (bit == TRACE_BLK_OPT_CLASSIC) { - if (set) - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; - else - trace_flags |= TRACE_ITER_CONTEXT_INFO; - } - return 0; -} - -static struct tracer blk_tracer __read_mostly = { - .name = "blk", - .init = blk_tracer_init, - .reset = blk_tracer_reset, - .start = blk_tracer_start, - .stop = blk_tracer_stop, - .print_header = blk_tracer_print_header, - .print_line = blk_tracer_print_line, - .flags = &blk_tracer_flags, - .set_flag = blk_tracer_set_flag, -}; - -static struct trace_event_functions trace_blk_event_funcs = { - .trace = blk_trace_event_print, - .binary = blk_trace_event_print_binary, -}; - -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, - .funcs = &trace_blk_event_funcs, -}; - -static int __init init_blk_tracer(void) -{ - if (!register_ftrace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); - return 1; - } - - if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); - unregister_ftrace_event(&trace_blk_event); - return 1; - } - - return 0; -} - -device_initcall(init_blk_tracer); - -static int blk_trace_remove_queue(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (bt == NULL) - return -EINVAL; - - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - - blk_trace_free(bt); - return 0; -} - -/* - * Setup everything required to start tracing - */ -static int blk_trace_setup_queue(struct request_queue *q, - struct block_device *bdev) -{ - struct blk_trace *old_bt, *bt = NULL; - 
int ret = -ENOMEM; - - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - return -ENOMEM; - - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); - if (!bt->msg_data) - goto free_bt; - - bt->dev = bdev->bd_dev; - bt->act_mask = (u16)-1; - - blk_trace_setup_lba(bt, bdev); - - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - ret = -EBUSY; - goto free_bt; - } - - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); - return 0; - -free_bt: - blk_trace_free(bt); - return ret; -} - -/* - * sysfs interface to enable and configure tracing - */ - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf); -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count); -#define BLK_TRACE_DEVICE_ATTR(_name) \ - DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ - sysfs_blk_trace_attr_show, \ - sysfs_blk_trace_attr_store) - -static BLK_TRACE_DEVICE_ATTR(enable); -static BLK_TRACE_DEVICE_ATTR(act_mask); -static BLK_TRACE_DEVICE_ATTR(pid); -static BLK_TRACE_DEVICE_ATTR(start_lba); -static BLK_TRACE_DEVICE_ATTR(end_lba); - -static struct attribute *blk_trace_attrs[] = { - &dev_attr_enable.attr, - &dev_attr_act_mask.attr, - &dev_attr_pid.attr, - &dev_attr_start_lba.attr, - &dev_attr_end_lba.attr, - NULL -}; - -struct attribute_group blk_trace_attr_group = { - .name = "trace", - .attrs = blk_trace_attrs, -}; - -static const struct { - int mask; - const char *str; -} mask_maps[] = { - { BLK_TC_READ, "read" }, - { BLK_TC_WRITE, "write" }, - { BLK_TC_FLUSH, "flush" }, - { BLK_TC_SYNC, "sync" }, - { BLK_TC_QUEUE, "queue" }, - { BLK_TC_REQUEUE, "requeue" }, - { BLK_TC_ISSUE, "issue" }, - { BLK_TC_COMPLETE, "complete" }, - { BLK_TC_FS, "fs" }, - { BLK_TC_PC, "pc" }, - { BLK_TC_AHEAD, "ahead" }, - { BLK_TC_META, "meta" }, - { BLK_TC_DISCARD, "discard" }, - { BLK_TC_DRV_DATA, "drv_data" }, - { 
BLK_TC_FUA, "fua" }, -}; - -static int blk_trace_str2mask(const char *str) -{ - int i; - int mask = 0; - char *buf, *s, *token; - - buf = kstrdup(str, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; - s = strstrip(buf); - - while (1) { - token = strsep(&s, ","); - if (token == NULL) - break; - - if (*token == '\0') - continue; - - for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { - if (strcasecmp(token, mask_maps[i].str) == 0) { - mask |= mask_maps[i].mask; - break; - } - } - if (i == ARRAY_SIZE(mask_maps)) { - mask = -EINVAL; - break; - } - } - kfree(buf); - - return mask; -} - -static ssize_t blk_trace_mask2str(char *buf, int mask) -{ - int i; - char *p = buf; - - for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { - if (mask & mask_maps[i].mask) { - p += sprintf(p, "%s%s", - (p == buf) ? "" : ",", mask_maps[i].str); - } - } - *p++ = '\n'; - - return p - buf; -} - -static struct request_queue *blk_trace_get_queue(struct block_device *bdev) -{ - if (bdev->bd_disk == NULL) - return NULL; - - return bdev_get_queue(bdev); -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q; - struct block_device *bdev; - ssize_t ret = -ENXIO; - - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - - if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); - goto out_unlock_bdev; - } - - if (q->blk_trace == NULL) - ret = sprintf(buf, "disabled\n"); - else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); - else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); - else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); - else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); - -out_unlock_bdev: - 
mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out: - return ret; -} - -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - u64 value; - ssize_t ret = -EINVAL; - - if (count == 0) - goto out; - - if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { - /* Assume it is a list of trace category names */ - ret = blk_trace_str2mask(buf); - if (ret < 0) - goto out; - value = ret; - } - } else if (sscanf(buf, "%llu", &value) != 1) - goto out; - - ret = -ENXIO; - - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - - if (attr == &dev_attr_enable) { - if (value) - ret = blk_trace_setup_queue(q, bdev); - else - ret = blk_trace_remove_queue(q); - goto out_unlock_bdev; - } - - ret = 0; - if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev); - - if (ret == 0) { - if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; - else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; - else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; - else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; - } - -out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out: - return ret ? 
ret : count; -} - -int blk_trace_init_sysfs(struct device *dev) -{ - return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); -} - -void blk_trace_remove_sysfs(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); -} - -#endif /* CONFIG_BLK_DEV_IO_TRACE */ - -#ifdef CONFIG_EVENT_TRACING - -void blk_dump_cmd(char *buf, struct request *rq) -{ - int i, end; - int len = rq->cmd_len; - unsigned char *cmd = rq->cmd; - - if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { - buf[0] = '\0'; - return; - } - - for (end = len - 1; end >= 0; end--) - if (cmd[end]) - break; - end++; - - for (i = 0; i < len; i++) { - buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); - if (i == end && end != len - 1) { - sprintf(buf, " .."); - break; - } - } -} - -void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) -{ - int i = 0; - - if (rw & REQ_FLUSH) - rwbs[i++] = 'F'; - - if (rw & WRITE) - rwbs[i++] = 'W'; - else if (rw & REQ_DISCARD) - rwbs[i++] = 'D'; - else if (bytes) - rwbs[i++] = 'R'; - else - rwbs[i++] = 'N'; - - if (rw & REQ_FUA) - rwbs[i++] = 'F'; - if (rw & REQ_RAHEAD) - rwbs[i++] = 'A'; - if (rw & REQ_SYNC) - rwbs[i++] = 'S'; - if (rw & REQ_META) - rwbs[i++] = 'M'; - if (rw & REQ_SECURE) - rwbs[i++] = 'E'; - - rwbs[i] = '\0'; -} - -#endif /* CONFIG_EVENT_TRACING */ - -/* - * Infrastructure for profiling code inserted by 'gcc -pg'. 
- * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2004-2008 Ingo Molnar - * - * Originally ported from the -rt patch by: - * Copyright (C) 2007 Arnaldo Carvalho de Melo - * - * Based on code in the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include "trace_output.h" -#include "trace_stat.h" - -#define FTRACE_WARN_ON(cond) \ - ({ \ - int ___r = cond; \ - if (WARN_ON(___r)) \ - ftrace_kill(); \ - ___r; \ - }) - -#define FTRACE_WARN_ON_ONCE(cond) \ - ({ \ - int ___r = cond; \ - if (WARN_ON_ONCE(___r)) \ - ftrace_kill(); \ - ___r; \ - }) - -/* hash bits for specific function selection */ -#define FTRACE_HASH_BITS 7 -#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) -#define FTRACE_HASH_DEFAULT_BITS 10 -#define FTRACE_HASH_MAX_BITS 12 - -/* ftrace_enabled is a method to turn ftrace on or off */ -int ftrace_enabled __read_mostly; -static int last_ftrace_enabled; - -/* Quick disabling of function tracer. */ -int function_trace_stop; - -/* List for set_ftrace_pid's pids. */ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { - struct list_head list; - struct pid *pid; -}; - -/* - * ftrace_disabled is set when an anomaly is discovered. - * ftrace_disabled is much stronger than ftrace_enabled. 
- */ -static int ftrace_disabled __read_mostly; - -static DEFINE_MUTEX(ftrace_lock); - -static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, -}; - -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; -ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; -ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; -static struct ftrace_ops global_ops; - -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); - -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ -static void ftrace_global_list_func(unsigned long ip, - unsigned long parent_ip) -{ - struct ftrace_ops *op; - - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) - return; - - trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { - op->func(ip, parent_ip); - op = rcu_dereference_raw(op->next); /*see above*/ - }; - trace_recursion_clear(TRACE_GLOBAL_BIT); -} - -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) -{ - if (!test_tsk_trace_trace(current)) - return; - - ftrace_pid_function(ip, parent_ip); -} - -static void set_ftrace_pid_function(ftrace_func_t func) -{ - /* do not set ftrace_pid_function to itself! 
*/ - if (func != ftrace_pid_func) - ftrace_pid_function = func; -} - -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; - __ftrace_trace_function = ftrace_stub; - __ftrace_trace_function_delay = ftrace_stub; - ftrace_pid_function = ftrace_stub; -} - -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. - */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) -{ - if (function_trace_stop) - return; - - __ftrace_trace_function(ip, parent_ip); -} -#endif - -static void update_global_ops(void) -{ - ftrace_func_t func; - - /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. - */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) - func = ftrace_global_list->func; - else - func = ftrace_global_list_func; - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - global_ops.func = func; -} - -static void update_ftrace_function(void) -{ - ftrace_func_t func; - - update_global_ops(); - - /* - * If we are at the end of the list and this ops is - * not dynamic, then have the mcount trampoline call - * the function directly - */ - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) - func = ftrace_ops_list->func; - else - func = ftrace_ops_list_func; - -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else -#ifdef CONFIG_DYNAMIC_FTRACE - /* do not update till all functions have been modified */ - 
__ftrace_trace_function_delay = func; -#else - __ftrace_trace_function = func; -#endif - ftrace_trace_function = ftrace_test_stop_func; -#endif -} - -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) -{ - ops->next = *list; - /* - * We are entering ops into the list but another - * CPU might be walking that list. We need to make sure - * the ops->next pointer is valid before another CPU sees - * the ops pointer included into the list. - */ - rcu_assign_pointer(*list, ops); -} - -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) -{ - struct ftrace_ops **p; - - /* - * If we are removing the last function, then simply point - * to the ftrace_stub. - */ - if (*list == ops && ops->next == &ftrace_list_end) { - *list = &ftrace_list_end; - return 0; - } - - for (p = list; *p != &ftrace_list_end; p = &(*p)->next) - if (*p == ops) - break; - - if (*p != ops) - return -1; - - *p = (*p)->next; - return 0; -} - -static int __register_ftrace_function(struct ftrace_ops *ops) -{ - if (ftrace_disabled) - return -ENODEV; - - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) - return -EBUSY; - - if (!core_kernel_data((unsigned long)ops)) - ops->flags |= FTRACE_OPS_FL_DYNAMIC; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); - } else - add_ftrace_ops(&ftrace_ops_list, ops); - - if (ftrace_enabled) - update_ftrace_function(); - - return 0; -} - -static int __unregister_ftrace_function(struct ftrace_ops *ops) -{ - int ret; - - if (ftrace_disabled) - return -ENODEV; - - if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) - return -EBUSY; - - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = 
remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } else - ret = remove_ftrace_ops(&ftrace_ops_list, ops); - - if (ret < 0) - return ret; - - if (ftrace_enabled) - update_ftrace_function(); - - /* - * Dynamic ops may be freed, we must make sure that all - * callers are done before leaving this function. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); - - return 0; -} - -static void ftrace_update_pid_func(void) -{ - /* Only do something if we are tracing something */ - if (ftrace_trace_function == ftrace_stub) - return; - - update_ftrace_function(); -} - -#ifdef CONFIG_FUNCTION_PROFILER -struct ftrace_profile { - struct hlist_node node; - unsigned long ip; - unsigned long counter; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - unsigned long long time; - unsigned long long time_squared; -#endif -}; - -struct ftrace_profile_page { - struct ftrace_profile_page *next; - unsigned long index; - struct ftrace_profile records[]; -}; - -struct ftrace_profile_stat { - atomic_t disabled; - struct hlist_head *hash; - struct ftrace_profile_page *pages; - struct ftrace_profile_page *start; - struct tracer_stat stat; -}; - -#define PROFILE_RECORDS_SIZE \ - (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) - -#define PROFILES_PER_PAGE \ - (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) - -static int ftrace_profile_bits __read_mostly; -static int ftrace_profile_enabled __read_mostly; - -/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ -static DEFINE_MUTEX(ftrace_profile_lock); - -static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); - -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ - -static void * -function_stat_next(void *v, int idx) -{ - struct ftrace_profile *rec = v; - struct ftrace_profile_page *pg; - - pg = (struct 
ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); - - again: - if (idx != 0) - rec++; - - if ((void *)rec >= (void *)&pg->records[pg->index]) { - pg = pg->next; - if (!pg) - return NULL; - rec = &pg->records[0]; - if (!rec->counter) - goto again; - } - - return rec; -} - -static void *function_stat_start(struct tracer_stat *trace) -{ - struct ftrace_profile_stat *stat = - container_of(trace, struct ftrace_profile_stat, stat); - - if (!stat || !stat->start) - return NULL; - - return function_stat_next(&stat->start->records[0], 0); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) -{ - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; - - if (a->time < b->time) - return -1; - if (a->time > b->time) - return 1; - else - return 0; -} -#else -/* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) -{ - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; - - if (a->counter < b->counter) - return -1; - if (a->counter > b->counter) - return 1; - else - return 0; -} -#endif - -static int function_stat_headers(struct seq_file *m) -{ -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function " - "Hit Time Avg s^2\n" - " -------- " - "--- ---- --- ---\n"); -#else - seq_printf(m, " Function Hit\n" - " -------- ---\n"); -#endif - return 0; -} - -static int function_stat_show(struct seq_file *m, void *v) -{ - struct ftrace_profile *rec = v; - char str[KSYM_SYMBOL_LEN]; - int ret = 0; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - static struct trace_seq s; - unsigned long long avg; - unsigned long long stddev; -#endif - mutex_lock(&ftrace_profile_lock); - - /* we raced with function_profile_reset() */ - if (unlikely(rec->counter == 0)) { - ret = -EBUSY; - goto out; - } - - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, " %-30.30s %10lu", str, rec->counter); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - 
seq_printf(m, " "); - avg = rec->time; - do_div(avg, rec->counter); - - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - stddev = rec->time_squared - rec->counter * avg * avg; - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - do_div(stddev, (rec->counter - 1) * 1000); - } - - trace_seq_init(&s); - trace_print_graph_duration(rec->time, &s); - trace_seq_puts(&s, " "); - trace_print_graph_duration(avg, &s); - trace_seq_puts(&s, " "); - trace_print_graph_duration(stddev, &s); - trace_print_seq(m, &s); -#endif - seq_putc(m, '\n'); -out: - mutex_unlock(&ftrace_profile_lock); - - return ret; -} - -static void ftrace_profile_reset(struct ftrace_profile_stat *stat) -{ - struct ftrace_profile_page *pg; - - pg = stat->pages = stat->start; - - while (pg) { - memset(pg->records, 0, PROFILE_RECORDS_SIZE); - pg->index = 0; - pg = pg->next; - } - - memset(stat->hash, 0, - FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); -} - -int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) -{ - struct ftrace_profile_page *pg; - int functions; - int pages; - int i; - - /* If we already allocated, do nothing */ - if (stat->pages) - return 0; - - stat->pages = (void *)get_zeroed_page(GFP_KERNEL); - if (!stat->pages) - return -ENOMEM; - -#ifdef CONFIG_DYNAMIC_FTRACE - functions = ftrace_update_tot_cnt; -#else - /* - * We do not know the number of functions that exist because - * dynamic tracing is what counts them. With past experience - * we have around 20K functions. That should be more than enough. - * It is highly unlikely we will execute every function in - * the kernel. 
- */ - functions = 20000; -#endif - - pg = stat->start = stat->pages; - - pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); - - for (i = 0; i < pages; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - if (!pg->next) - goto out_free; - pg = pg->next; - } - - return 0; - - out_free: - pg = stat->start; - while (pg) { - unsigned long tmp = (unsigned long)pg; - - pg = pg->next; - free_page(tmp); - } - - free_page((unsigned long)stat->pages); - stat->pages = NULL; - stat->start = NULL; - - return -ENOMEM; -} - -static int ftrace_profile_init_cpu(int cpu) -{ - struct ftrace_profile_stat *stat; - int size; - - stat = &per_cpu(ftrace_profile_stats, cpu); - - if (stat->hash) { - /* If the profile is already created, simply reset it */ - ftrace_profile_reset(stat); - return 0; - } - - /* - * We are profiling all functions, but usually only a few thousand - * functions are hit. We'll make a hash of 1024 items. - */ - size = FTRACE_PROFILE_HASH_SIZE; - - stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); - - if (!stat->hash) - return -ENOMEM; - - if (!ftrace_profile_bits) { - size--; - - for (; size; size >>= 1) - ftrace_profile_bits++; - } - - /* Preallocate the function profiling pages */ - if (ftrace_profile_pages_init(stat) < 0) { - kfree(stat->hash); - stat->hash = NULL; - return -ENOMEM; - } - - return 0; -} - -static int ftrace_profile_init(void) -{ - int cpu; - int ret = 0; - - for_each_online_cpu(cpu) { - ret = ftrace_profile_init_cpu(cpu); - if (ret) - break; - } - - return ret; -} - -/* interrupts must be disabled */ -static struct ftrace_profile * -ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) -{ - struct ftrace_profile *rec; - struct hlist_head *hhd; - struct hlist_node *n; - unsigned long key; - - key = hash_long(ip, ftrace_profile_bits); - hhd = &stat->hash[key]; - - if (hlist_empty(hhd)) - return NULL; - - hlist_for_each_entry_rcu(rec, n, hhd, node) { - if (rec->ip == ip) - return rec; - } - - return 
NULL; -} - -static void ftrace_add_profile(struct ftrace_profile_stat *stat, - struct ftrace_profile *rec) -{ - unsigned long key; - - key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &stat->hash[key]); -} - -/* - * The memory is already allocated, this simply finds a new record to use. - */ -static struct ftrace_profile * -ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) -{ - struct ftrace_profile *rec = NULL; - - /* prevent recursion (from NMIs) */ - if (atomic_inc_return(&stat->disabled) != 1) - goto out; - - /* - * Try to find the function again since an NMI - * could have added it - */ - rec = ftrace_find_profiled_func(stat, ip); - if (rec) - goto out; - - if (stat->pages->index == PROFILES_PER_PAGE) { - if (!stat->pages->next) - goto out; - stat->pages = stat->pages->next; - } - - rec = &stat->pages->records[stat->pages->index++]; - rec->ip = ip; - ftrace_add_profile(stat, rec); - - out: - atomic_dec(&stat->disabled); - - return rec; -} - -static void -function_profile_call(unsigned long ip, unsigned long parent_ip) -{ - struct ftrace_profile_stat *stat; - struct ftrace_profile *rec; - unsigned long flags; - - if (!ftrace_profile_enabled) - return; - - local_irq_save(flags); - - stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash || !ftrace_profile_enabled) - goto out; - - rec = ftrace_find_profiled_func(stat, ip); - if (!rec) { - rec = ftrace_profile_alloc(stat, ip); - if (!rec) - goto out; - } - - rec->counter++; - out: - local_irq_restore(flags); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int profile_graph_entry(struct ftrace_graph_ent *trace) -{ - function_profile_call(trace->func, 0); - return 1; -} - -static void profile_graph_return(struct ftrace_graph_ret *trace) -{ - struct ftrace_profile_stat *stat; - unsigned long long calltime; - struct ftrace_profile *rec; - unsigned long flags; - - local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash || 
!ftrace_profile_enabled) - goto out; - - /* If the calltime was zero'd ignore it */ - if (!trace->calltime) - goto out; - - calltime = trace->rettime - trace->calltime; - - if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { - int index; - - index = trace->depth; - - /* Append this call time to the parent time to subtract */ - if (index) - current->ret_stack[index - 1].subtime += calltime; - - if (current->ret_stack[index].subtime < calltime) - calltime -= current->ret_stack[index].subtime; - else - calltime = 0; - } - - rec = ftrace_find_profiled_func(stat, trace->func); - if (rec) { - rec->time += calltime; - rec->time_squared += calltime * calltime; - } - - out: - local_irq_restore(flags); -} - -static int register_ftrace_profiler(void) -{ - return register_ftrace_graph(&profile_graph_return, - &profile_graph_entry); -} - -static void unregister_ftrace_profiler(void) -{ - unregister_ftrace_graph(); -} -#else -static struct ftrace_ops ftrace_profile_ops __read_mostly = { - .func = function_profile_call, -}; - -static int register_ftrace_profiler(void) -{ - return register_ftrace_function(&ftrace_profile_ops); -} - -static void unregister_ftrace_profiler(void) -{ - unregister_ftrace_function(&ftrace_profile_ops); -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -static ssize_t -ftrace_profile_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - val = !!val; - - mutex_lock(&ftrace_profile_lock); - if (ftrace_profile_enabled ^ val) { - if (val) { - ret = ftrace_profile_init(); - if (ret < 0) { - cnt = ret; - goto out; - } - - ret = register_ftrace_profiler(); - if (ret < 0) { - cnt = ret; - goto out; - } - ftrace_profile_enabled = 1; - } else { - ftrace_profile_enabled = 0; - /* - * unregister_ftrace_profiler calls stop_machine - * so this acts like an synchronize_sched. 
- */ - unregister_ftrace_profiler(); - } - } - out: - mutex_unlock(&ftrace_profile_lock); - - *ppos += cnt; - - return cnt; -} - -static ssize_t -ftrace_profile_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; /* big enough to hold a number */ - int r; - - r = sprintf(buf, "%u\n", ftrace_profile_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static const struct file_operations ftrace_profile_fops = { - .open = tracing_open_generic, - .read = ftrace_profile_read, - .write = ftrace_profile_write, - .llseek = default_llseek, -}; - -/* used to initialize the real stat files */ -static struct tracer_stat function_stats __initdata = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) -{ - struct ftrace_profile_stat *stat; - struct dentry *entry; - char *name; - int ret; - int cpu; - - for_each_possible_cpu(cpu) { - stat = &per_cpu(ftrace_profile_stats, cpu); - - /* allocate enough for function name + cpu number */ - name = kmalloc(32, GFP_KERNEL); - if (!name) { - /* - * The files created are permanent, if something happens - * we still do not free memory. 
- */ - WARN(1, - "Could not allocate stat file for cpu %d\n", - cpu); - return; - } - stat->stat = function_stats; - snprintf(name, 32, "function%d", cpu); - stat->stat.name = name; - ret = register_stat_tracer(&stat->stat); - if (ret) { - WARN(1, - "Could not register function stat for cpu %d\n", - cpu); - kfree(name); - return; - } - } - - entry = debugfs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'function_profile_enabled' entry\n"); -} - -#else /* CONFIG_FUNCTION_PROFILER */ -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) -{ -} -#endif /* CONFIG_FUNCTION_PROFILER */ - -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct rcu_head rcu; -}; - -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; -}; - -struct ftrace_hash { - unsigned long size_bits; - struct hlist_head *buckets; - unsigned long count; - struct rcu_head rcu; -}; - -/* - * We make these constant because no one should touch them, - * but they are used as the default "empty hash", to avoid allocating - * it all the time. These are in a read only section such that if - * anyone does try to modify it, it will cause an exception. 
- */ -static const struct hlist_head empty_buckets[1]; -static const struct ftrace_hash empty_hash = { - .buckets = (struct hlist_head *)empty_buckets, -}; -#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) - -static struct ftrace_ops global_ops = { - .func = ftrace_stub, - .notrace_hash = EMPTY_HASH, - .filter_hash = EMPTY_HASH, -}; - -static DEFINE_MUTEX(ftrace_regex_lock); - -struct ftrace_page { - struct ftrace_page *next; - struct dyn_ftrace *records; - int index; - int size; -}; - -static struct ftrace_page *ftrace_new_pgs; - -#define ENTRY_SIZE sizeof(struct dyn_ftrace) -#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -static bool ftrace_hash_empty(struct ftrace_hash *hash) -{ - return !hash || !hash->count; -} - -static struct ftrace_func_entry * -ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) -{ - unsigned long key; - struct ftrace_func_entry *entry; - struct hlist_head *hhd; - struct hlist_node *n; - - if (ftrace_hash_empty(hash)) - return NULL; - - if (hash->size_bits > 0) - key = hash_long(ip, hash->size_bits); - else - key = 0; - - hhd = &hash->buckets[key]; - - hlist_for_each_entry_rcu(entry, n, hhd, hlist) { - if (entry->ip == ip) - return entry; - } - return NULL; -} - -static void __add_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) -{ - struct hlist_head *hhd; - unsigned long key; - - if (hash->size_bits) - key = hash_long(entry->ip, hash->size_bits); - else - key = 0; - - hhd = &hash->buckets[key]; - hlist_add_head(&entry->hlist, hhd); - hash->count++; -} - -static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) -{ - struct ftrace_func_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - entry->ip = ip; - __add_hash_entry(hash, entry); - - return 0; -} - -static void 
-free_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) -{ - hlist_del(&entry->hlist); - kfree(entry); - hash->count--; -} - -static void -remove_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) -{ - hlist_del(&entry->hlist); - hash->count--; -} - -static void ftrace_hash_clear(struct ftrace_hash *hash) -{ - struct hlist_head *hhd; - struct hlist_node *tp, *tn; - struct ftrace_func_entry *entry; - int size = 1 << hash->size_bits; - int i; - - if (!hash->count) - return; - - for (i = 0; i < size; i++) { - hhd = &hash->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) - free_hash_entry(hash, entry); - } - FTRACE_WARN_ON(hash->count); -} - -static void free_ftrace_hash(struct ftrace_hash *hash) -{ - if (!hash || hash == EMPTY_HASH) - return; - ftrace_hash_clear(hash); - kfree(hash->buckets); - kfree(hash); -} - -static void __free_ftrace_hash_rcu(struct rcu_head *rcu) -{ - struct ftrace_hash *hash; - - hash = container_of(rcu, struct ftrace_hash, rcu); - free_ftrace_hash(hash); -} - -static void free_ftrace_hash_rcu(struct ftrace_hash *hash) -{ - if (!hash || hash == EMPTY_HASH) - return; - call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); -} - -static struct ftrace_hash *alloc_ftrace_hash(int size_bits) -{ - struct ftrace_hash *hash; - int size; - - hash = kzalloc(sizeof(*hash), GFP_KERNEL); - if (!hash) - return NULL; - - size = 1 << size_bits; - hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); - - if (!hash->buckets) { - kfree(hash); - return NULL; - } - - hash->size_bits = size_bits; - - return hash; -} - -static struct ftrace_hash * -alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) -{ - struct ftrace_func_entry *entry; - struct ftrace_hash *new_hash; - struct hlist_node *tp; - int size; - int ret; - int i; - - new_hash = alloc_ftrace_hash(size_bits); - if (!new_hash) - return NULL; - - /* Empty hash? 
*/ - if (ftrace_hash_empty(hash)) - return new_hash; - - size = 1 << hash->size_bits; - for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { - ret = add_hash_entry(new_hash, entry->ip); - if (ret < 0) - goto free_hash; - } - } - - FTRACE_WARN_ON(new_hash->count != hash->count); - - return new_hash; - - free_hash: - free_ftrace_hash(new_hash); - return NULL; -} - -static void -ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); -static void -ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); - -static int -ftrace_hash_move(struct ftrace_ops *ops, int enable, - struct ftrace_hash **dst, struct ftrace_hash *src) -{ - struct ftrace_func_entry *entry; - struct hlist_node *tp, *tn; - struct hlist_head *hhd; - struct ftrace_hash *old_hash; - struct ftrace_hash *new_hash; - unsigned long key; - int size = src->count; - int bits = 0; - int ret; - int i; - - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(ops, enable); - - /* - * If the new source is empty, just free dst and assign it - * the empty_hash. 
- */ - if (!src->count) { - free_ftrace_hash_rcu(*dst); - rcu_assign_pointer(*dst, EMPTY_HASH); - /* still need to update the function records */ - ret = 0; - goto out; - } - - /* - * Make the hash size about 1/2 the # found - */ - for (size /= 2; size; size >>= 1) - bits++; - - /* Don't allocate too much */ - if (bits > FTRACE_HASH_MAX_BITS) - bits = FTRACE_HASH_MAX_BITS; - - ret = -ENOMEM; - new_hash = alloc_ftrace_hash(bits); - if (!new_hash) - goto out; - - size = 1 << src->size_bits; - for (i = 0; i < size; i++) { - hhd = &src->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { - if (bits > 0) - key = hash_long(entry->ip, bits); - else - key = 0; - remove_hash_entry(src, entry); - __add_hash_entry(new_hash, entry); - } - } - - old_hash = *dst; - rcu_assign_pointer(*dst, new_hash); - free_ftrace_hash_rcu(old_hash); - - ret = 0; - out: - /* - * Enable regardless of ret: - * On success, we enable the new hash. - * On failure, we re-enable the original hash. - */ - ftrace_hash_rec_enable(ops, enable); - - return ret; -} - -/* - * Test the hashes for this ops to see if we want to call - * the ops->func or not. - * - * It's a match if the ip is in the ops->filter_hash or - * the filter_hash does not exist or is empty, - * AND - * the ip is not in the ops->notrace_hash. - * - * This needs to be called with preemption disabled as - * the hashes are freed with call_rcu_sched(). - */ -static int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) -{ - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; - int ret; - - filter_hash = rcu_dereference_raw(ops->filter_hash); - notrace_hash = rcu_dereference_raw(ops->notrace_hash); - - if ((ftrace_hash_empty(filter_hash) || - ftrace_lookup_ip(filter_hash, ip)) && - (ftrace_hash_empty(notrace_hash) || - !ftrace_lookup_ip(notrace_hash, ip))) - ret = 1; - else - ret = 0; - - return ret; -} - -/* - * This is a double for. 
Do not use 'break' to break out of the loop, - * you must use a goto. - */ -#define do_for_each_ftrace_rec(pg, rec) \ - for (pg = ftrace_pages_start; pg; pg = pg->next) { \ - int _____i; \ - for (_____i = 0; _____i < pg->index; _____i++) { \ - rec = &pg->records[_____i]; - -#define while_for_each_ftrace_rec() \ - } \ - } - - -static int ftrace_cmp_recs(const void *a, const void *b) -{ - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; - - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) - return -1; - return 0; -} - -/** - * ftrace_location - return true if the ip giving is a traced location - * @ip: the instruction pointer to check - * - * Returns 1 if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. - */ -int ftrace_location(unsigned long ip) -{ - struct ftrace_page *pg; - struct dyn_ftrace *rec; - struct dyn_ftrace key; - - key.ip = ip; - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return 1; - } - - return 0; -} - -static void __ftrace_hash_rec_update(struct ftrace_ops *ops, - int filter_hash, - bool inc) -{ - struct ftrace_hash *hash; - struct ftrace_hash *other_hash; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int count = 0; - int all = 0; - - /* Only update if the ops has been registered */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return; - - /* - * In the filter_hash case: - * If the count is zero, we update all records. - * Otherwise we just update the items in the hash. - * - * In the notrace_hash case: - * We enable the update in the hash. - * As disabling notrace means enabling the tracing, - * and enabling notrace means disabling, the inc variable - * gets inversed. 
- */ - if (filter_hash) { - hash = ops->filter_hash; - other_hash = ops->notrace_hash; - if (ftrace_hash_empty(hash)) - all = 1; - } else { - inc = !inc; - hash = ops->notrace_hash; - other_hash = ops->filter_hash; - /* - * If the notrace hash has no items, - * then there's nothing to do. - */ - if (ftrace_hash_empty(hash)) - return; - } - - do_for_each_ftrace_rec(pg, rec) { - int in_other_hash = 0; - int in_hash = 0; - int match = 0; - - if (all) { - /* - * Only the filter_hash affects all records. - * Update if the record is not in the notrace hash. - */ - if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) - match = 1; - } else { - in_hash = !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); - - /* - * - */ - if (filter_hash && in_hash && !in_other_hash) - match = 1; - else if (!filter_hash && in_hash && - (in_other_hash || ftrace_hash_empty(other_hash))) - match = 1; - } - if (!match) - continue; - - if (inc) { - rec->flags++; - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) - return; - } else { - if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) - return; - rec->flags--; - } - count++; - /* Shortcut, if we handled all records, we are done. 
*/ - if (!all && count == hash->count) - return; - } while_for_each_ftrace_rec(); -} - -static void ftrace_hash_rec_disable(struct ftrace_ops *ops, - int filter_hash) -{ - __ftrace_hash_rec_update(ops, filter_hash, 0); -} - -static void ftrace_hash_rec_enable(struct ftrace_ops *ops, - int filter_hash) -{ - __ftrace_hash_rec_update(ops, filter_hash, 1); -} - -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -{ - if (ftrace_pages->index == ftrace_pages->size) { - /* We should have allocated enough */ - if (WARN_ON(!ftrace_pages->next)) - return NULL; - ftrace_pages = ftrace_pages->next; - } - - return &ftrace_pages->records[ftrace_pages->index++]; -} - -static struct dyn_ftrace * -ftrace_record_ip(unsigned long ip) -{ - struct dyn_ftrace *rec; - - if (ftrace_disabled) - return NULL; - - rec = ftrace_alloc_dyn_node(ip); - if (!rec) - return NULL; - - rec->ip = ip; - - return rec; -} - -static void print_ip_ins(const char *fmt, unsigned char *p) -{ - int i; - - printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); -} - -/** - * ftrace_bug - report and shutdown function tracer - * @failed: The failed type (EFAULT, EINVAL, EPERM) - * @ip: The address that failed - * - * The arch code that enables or disables the function tracing - * can call ftrace_bug() when it has detected a problem in - * modifying the code. 
@failed should be one of either: - * EFAULT - if the problem happens on reading the @ip address - * EINVAL - if what is read at @ip is not what was expected - * EPERM - if the problem happens on writting to the @ip address - */ -void ftrace_bug(int failed, unsigned long ip) -{ - switch (failed) { - case -EFAULT: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); - break; - case -EINVAL: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace failed to modify "); - print_ip_sym(ip); - print_ip_ins(" actual: ", (unsigned char *)ip); - printk(KERN_CONT "\n"); - break; - case -EPERM: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on writing "); - print_ip_sym(ip); - break; - default: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); - } -} - - -/* Return 1 if the address range is reserved for ftrace */ -int ftrace_text_reserved(void *start, void *end) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip <= (unsigned long)end && - rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) - return 1; - } while_for_each_ftrace_rec(); - return 0; -} - -static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) -{ - unsigned long flag = 0UL; - - /* - * If we are updating calls: - * - * If the record has a ref count, then we need to enable it - * because someone is using it. - * - * Otherwise we make sure its disabled. - * - * If we are disabling calls, then disable all records that - * are enabled. 
- */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) - flag = FTRACE_FL_ENABLED; - - /* If the state of this record hasn't changed, then do nothing */ - if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return FTRACE_UPDATE_IGNORE; - - if (flag) { - if (update) - rec->flags |= FTRACE_FL_ENABLED; - return FTRACE_UPDATE_MAKE_CALL; - } - - if (update) - rec->flags &= ~FTRACE_FL_ENABLED; - - return FTRACE_UPDATE_MAKE_NOP; -} - -/** - * ftrace_update_record, set a record that now is tracing or not - * @rec: the record to update - * @enable: set to 1 if the record is tracing, zero to force disable - * - * The records that represent all functions that can be traced need - * to be updated when tracing has been enabled. - */ -int ftrace_update_record(struct dyn_ftrace *rec, int enable) -{ - return ftrace_check_record(rec, enable, 1); -} - -/** - * ftrace_test_record, check if the record has been enabled or not - * @rec: the record to test - * @enable: set to 1 to check if enabled, 0 if it is disabled - * - * The arch code may need to test if a record is already set to - * tracing to determine how to modify the function code that it - * represents. 
- */ -int ftrace_test_record(struct dyn_ftrace *rec, int enable) -{ - return ftrace_check_record(rec, enable, 0); -} - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) -{ - unsigned long ftrace_addr; - int ret; - - ftrace_addr = (unsigned long)FTRACE_ADDR; - - ret = ftrace_update_record(rec, enable); - - switch (ret) { - case FTRACE_UPDATE_IGNORE: - return 0; - - case FTRACE_UPDATE_MAKE_CALL: - return ftrace_make_call(rec, ftrace_addr); - - case FTRACE_UPDATE_MAKE_NOP: - return ftrace_make_nop(NULL, rec, ftrace_addr); - } - - return -1; /* unknow ftrace bug */ -} - -static void ftrace_replace_code(int update) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - int failed; - - if (unlikely(ftrace_disabled)) - return; - - do_for_each_ftrace_rec(pg, rec) { - failed = __ftrace_replace_code(rec, update); - if (failed) { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } - } while_for_each_ftrace_rec(); -} - -struct ftrace_rec_iter { - struct ftrace_page *pg; - int index; -}; - -/** - * ftrace_rec_iter_start, start up iterating over traced functions - * - * Returns an iterator handle that is used to iterate over all - * the records that represent address locations where functions - * are traced. - * - * May return NULL if no records are available. - */ -struct ftrace_rec_iter *ftrace_rec_iter_start(void) -{ - /* - * We only use a single iterator. - * Protected by the ftrace_lock mutex. - */ - static struct ftrace_rec_iter ftrace_rec_iter; - struct ftrace_rec_iter *iter = &ftrace_rec_iter; - - iter->pg = ftrace_pages_start; - iter->index = 0; - - /* Could have empty pages */ - while (iter->pg && !iter->pg->index) - iter->pg = iter->pg->next; - - if (!iter->pg) - return NULL; - - return iter; -} - -/** - * ftrace_rec_iter_next, get the next record to process. - * @iter: The handle to the iterator. - * - * Returns the next iterator after the given iterator @iter. 
- */ -struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) -{ - iter->index++; - - if (iter->index >= iter->pg->index) { - iter->pg = iter->pg->next; - iter->index = 0; - - /* Could have empty pages */ - while (iter->pg && !iter->pg->index) - iter->pg = iter->pg->next; - } - - if (!iter->pg) - return NULL; - - return iter; -} - -/** - * ftrace_rec_iter_record, get the record at the iterator location - * @iter: The current iterator location - * - * Returns the record that the current @iter is at. - */ -struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) -{ - return &iter->pg->records[iter->index]; -} - -static int -ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) -{ - unsigned long ip; - int ret; - - ip = rec->ip; - - if (unlikely(ftrace_disabled)) - return 0; - - ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); - if (ret) { - ftrace_bug(ret, ip); - return 0; - } - return 1; -} - -/* - * archs can override this function if they must do something - * before the modifying code is performed. - */ -int __weak ftrace_arch_code_modify_prepare(void) -{ - return 0; -} - -/* - * archs can override this function if they must do something - * after the modifying code is performed. 
- */ -int __weak ftrace_arch_code_modify_post_process(void) -{ - return 0; -} - -static int __ftrace_modify_code(void *data) -{ - int *command = data; - - if (*command & FTRACE_UPDATE_CALLS) - ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) - ftrace_replace_code(0); - - if (*command & FTRACE_UPDATE_TRACE_FUNC) - ftrace_update_ftrace_func(ftrace_trace_function); - - if (*command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); - else if (*command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); - - return 0; -} - -/** - * ftrace_run_stop_machine, go back to the stop machine method - * @command: The command to tell ftrace what to do - * - * If an arch needs to fall back to the stop machine method, the - * it can call this function. - */ -void ftrace_run_stop_machine(int command) -{ - stop_machine(__ftrace_modify_code, &command, NULL); -} - -/** - * arch_ftrace_update_code, modify the code to trace or not trace - * @command: The command that needs to be done - * - * Archs can override this function if it does not need to - * run stop_machine() to modify code. - */ -void __weak arch_ftrace_update_code(int command) -{ - ftrace_run_stop_machine(command); -} - -static void ftrace_run_update_code(int command) -{ - int ret; - - ret = ftrace_arch_code_modify_prepare(); - FTRACE_WARN_ON(ret); - if (ret) - return; - /* - * Do not call function tracer while we update the code. - * We are in stop machine. - */ - function_trace_stop++; - - /* - * By default we use stop_machine() to modify the code. - * But archs can do what ever they want as long as it - * is safe. The stop_machine() is the safest, but also - * produces the most overhead. - */ - arch_ftrace_update_code(command); - -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. 
This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - - ret = ftrace_arch_code_modify_post_process(); - FTRACE_WARN_ON(ret); -} - -static ftrace_func_t saved_ftrace_func; -static int ftrace_start_up; -static int global_start_up; - -static void ftrace_startup_enable(int command) -{ - if (saved_ftrace_func != ftrace_trace_function) { - saved_ftrace_func = ftrace_trace_function; - command |= FTRACE_UPDATE_TRACE_FUNC; - } - - if (!command || !ftrace_enabled) - return; - - ftrace_run_update_code(command); -} - -static int ftrace_startup(struct ftrace_ops *ops, int command) -{ - bool hash_enable = true; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - ftrace_start_up++; - command |= FTRACE_UPDATE_CALLS; - - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - - ops->flags |= FTRACE_OPS_FL_ENABLED; - if (hash_enable) - ftrace_hash_rec_enable(ops, 1); - - ftrace_startup_enable(command); - - return 0; -} - -static void ftrace_shutdown(struct ftrace_ops *ops, int command) -{ - bool hash_disable = true; - - if (unlikely(ftrace_disabled)) - return; - - ftrace_start_up--; - /* - * Just warn in case of unbalance, no need to kill ftrace, it's not - * critical but the ftrace_call callers may be never nopped again after - * further ftrace uses. 
- */ - WARN_ON_ONCE(ftrace_start_up < 0); - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - - if (hash_disable) - ftrace_hash_rec_disable(ops, 1); - - if (ops != &global_ops || !global_start_up) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - - command |= FTRACE_UPDATE_CALLS; - - if (saved_ftrace_func != ftrace_trace_function) { - saved_ftrace_func = ftrace_trace_function; - command |= FTRACE_UPDATE_TRACE_FUNC; - } - - if (!command || !ftrace_enabled) - return; - - ftrace_run_update_code(command); -} - -static void ftrace_startup_sysctl(void) -{ - if (unlikely(ftrace_disabled)) - return; - - /* Force update next time */ - saved_ftrace_func = NULL; - /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); -} - -static void ftrace_shutdown_sysctl(void) -{ - if (unlikely(ftrace_disabled)) - return; - - /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_DISABLE_CALLS); -} - -static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; -unsigned long ftrace_update_tot_cnt; - -static int ops_traces_mod(struct ftrace_ops *ops) -{ - struct ftrace_hash *hash; - - hash = ops->filter_hash; - return ftrace_hash_empty(hash); -} - -static int ftrace_update_code(struct module *mod) -{ - struct ftrace_page *pg; - struct dyn_ftrace *p; - cycle_t start, stop; - unsigned long ref = 0; - int i; - - /* - * When adding a module, we need to check if tracers are - * currently enabled and if they are set to trace all functions. - * If they are, we need to enable the module functions as well - * as update the reference counts for those function records. 
- */ - if (mod) { - struct ftrace_ops *ops; - - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && - ops_traces_mod(ops)) - ref++; - } - } - - start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - - for (pg = ftrace_new_pgs; pg; pg = pg->next) { - - for (i = 0; i < pg->index; i++) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; - - p = &pg->records[i]; - p->flags = ref; - - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) - break; - - ftrace_update_cnt++; - - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p->ip); - } - } - } - - ftrace_new_pgs = NULL; - - stop = ftrace_now(raw_smp_processor_id()); - ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; - - return 0; -} - -static int ftrace_allocate_records(struct ftrace_page *pg, int count) -{ - int order; - int cnt; - - if (WARN_ON(!count)) - return -EINVAL; - - order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); - - /* - * We want to fill as much as possible. No more than a page - * may be empty. 
- */ - while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) - order--; - - again: - pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - - if (!pg->records) { - /* if we can't allocate this size, try something smaller */ - if (!order) - return -ENOMEM; - order >>= 1; - goto again; - } - - cnt = (PAGE_SIZE << order) / ENTRY_SIZE; - pg->size = cnt; - - if (cnt > count) - cnt = count; - - return cnt; -} - -static struct ftrace_page * -ftrace_allocate_pages(unsigned long num_to_init) -{ - struct ftrace_page *start_pg; - struct ftrace_page *pg; - int order; - int cnt; - - if (!num_to_init) - return 0; - - start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); - if (!pg) - return NULL; - - /* - * Try to allocate as much as possible in one continues - * location that fills in all of the space. We want to - * waste as little space as possible. - */ - for (;;) { - cnt = ftrace_allocate_records(pg, num_to_init); - if (cnt < 0) - goto free_pages; - - num_to_init -= cnt; - if (!num_to_init) - break; - - pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); - if (!pg->next) - goto free_pages; - - pg = pg->next; - } - - return start_pg; - - free_pages: - while (start_pg) { - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - free_pages((unsigned long)pg->records, order); - start_pg = pg->next; - kfree(pg); - pg = start_pg; - } - pr_info("ftrace: FAILED to allocate memory for functions\n"); - return NULL; -} - -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) -{ - int cnt; - - if (!num_to_init) { - pr_info("ftrace: No functions to be traced?\n"); - return -1; - } - - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); - - return 0; -} - -#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ - -struct ftrace_iterator { - loff_t pos; - loff_t func_pos; - struct ftrace_page *pg; - struct dyn_ftrace *func; - struct ftrace_func_probe *probe; - struct 
trace_parser parser; - struct ftrace_hash *hash; - struct ftrace_ops *ops; - int hidx; - int idx; - unsigned flags; -}; - -static void * -t_hash_next(struct seq_file *m, loff_t *pos) -{ - struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = NULL; - struct hlist_head *hhd; - - (*pos)++; - iter->pos = *pos; - - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) - return NULL; - - hhd = &ftrace_func_hash[iter->hidx]; - - if (hlist_empty(hhd)) { - iter->hidx++; - hnd = NULL; - goto retry; - } - - if (!hnd) - hnd = hhd->first; - else { - hnd = hnd->next; - if (!hnd) { - iter->hidx++; - goto retry; - } - } - - if (WARN_ON_ONCE(!hnd)) - return NULL; - - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); - - return iter; -} - -static void *t_hash_start(struct seq_file *m, loff_t *pos) -{ - struct ftrace_iterator *iter = m->private; - void *p = NULL; - loff_t l; - - if (!(iter->flags & FTRACE_ITER_DO_HASH)) - return NULL; - - if (iter->func_pos > *pos) - return NULL; - - iter->hidx = 0; - for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); - if (!p) - break; - } - if (!p) - return NULL; - - /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; - - return iter; -} - -static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) -{ - struct ftrace_func_probe *rec; - - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) - return -EIO; - - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, rec->data); - - seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); - - if (rec->data) - seq_printf(m, ":%p", rec->data); - seq_putc(m, '\n'); - - return 0; -} - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; - struct dyn_ftrace *rec = NULL; - - if (unlikely(ftrace_disabled)) - return NULL; - - if (iter->flags & FTRACE_ITER_HASH) - return 
t_hash_next(m, pos); - - (*pos)++; - iter->pos = iter->func_pos = *pos; - - if (iter->flags & FTRACE_ITER_PRINTALL) - return t_hash_start(m, pos); - - retry: - if (iter->idx >= iter->pg->index) { - if (iter->pg->next) { - iter->pg = iter->pg->next; - iter->idx = 0; - goto retry; - } - } else { - rec = &iter->pg->records[iter->idx++]; - if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || - - ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || - - ((iter->flags & FTRACE_ITER_ENABLED) && - !(rec->flags & ~FTRACE_FL_MASK))) { - - rec = NULL; - goto retry; - } - } - - if (!rec) - return t_hash_start(m, pos); - - iter->func = rec; - - return iter; -} - -static void reset_iter_read(struct ftrace_iterator *iter) -{ - iter->pos = 0; - iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = iter->ops; - void *p = NULL; - loff_t l; - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - return NULL; - - /* - * If an lseek was done, then reset and start from beginning. - */ - if (*pos < iter->pos) - reset_iter_read(iter); - - /* - * For set_ftrace_filter reading, if we have the filter - * off, we can short cut and just print out that all - * functions are enabled. - */ - if (iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) { - if (*pos > 0) - return t_hash_start(m, pos); - iter->flags |= FTRACE_ITER_PRINTALL; - /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; - return iter; - } - - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); - - /* - * Unfortunately, we need to restart at ftrace_pages_start - * every time we let go of the ftrace_mutex. This is because - * those pointers can change without the lock. 
- */ - iter->pg = ftrace_pages_start; - iter->idx = 0; - for (l = 0; l <= *pos; ) { - p = t_next(m, p, &l); - if (!p) - break; - } - - if (!p) - return t_hash_start(m, pos); - - return iter; -} - -static void t_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&ftrace_lock); -} - -static int t_show(struct seq_file *m, void *v) -{ - struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec; - - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); - - if (iter->flags & FTRACE_ITER_PRINTALL) { - seq_printf(m, "#### all functions enabled ####\n"); - return 0; - } - - rec = iter->func; - - if (!rec) - return 0; - - seq_printf(m, "%ps", (void *)rec->ip); - if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)", - rec->flags & ~FTRACE_FL_MASK); - seq_printf(m, "\n"); - - return 0; -} - -static const struct seq_operations show_ftrace_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int -ftrace_avail_open(struct inode *inode, struct file *file) -{ - struct ftrace_iterator *iter; - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); - } - - return ret; -} - -static int -ftrace_enabled_open(struct inode *inode, struct file *file) -{ - struct ftrace_iterator *iter; - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); - } - - return ret; -} - -static void 
ftrace_filter_reset(struct ftrace_hash *hash) -{ - mutex_lock(&ftrace_lock); - ftrace_hash_clear(hash); - mutex_unlock(&ftrace_lock); -} - -/** - * ftrace_regex_open - initialize function tracer filter files - * @ops: The ftrace_ops that hold the hash filters - * @flag: The type of filter to process - * @inode: The inode, usually passed in to your open routine - * @file: The file, usually passed in to your open routine - * - * ftrace_regex_open() initializes the filter files for the - * @ops. Depending on @flag it may process the filter hash or - * the notrace hash of @ops. With this called from the open - * routine, you can use ftrace_filter_write() for the write - * routine if @flag has FTRACE_ITER_FILTER set, or - * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and - * release must call ftrace_regex_release(). - */ -int -ftrace_regex_open(struct ftrace_ops *ops, int flag, - struct inode *inode, struct file *file) -{ - struct ftrace_iterator *iter; - struct ftrace_hash *hash; - int ret = 0; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { - kfree(iter); - return -ENOMEM; - } - - if (flag & FTRACE_ITER_NOTRACE) - hash = ops->notrace_hash; - else - hash = ops->filter_hash; - - iter->ops = ops; - iter->flags = flag; - - if (file->f_mode & FMODE_WRITE) { - mutex_lock(&ftrace_lock); - iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); - mutex_unlock(&ftrace_lock); - - if (!iter->hash) { - trace_parser_put(&iter->parser); - kfree(iter); - return -ENOMEM; - } - } - - mutex_lock(&ftrace_regex_lock); - - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_filter_reset(iter->hash); - - if (file->f_mode & FMODE_READ) { - iter->pg = ftrace_pages_start; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) 
{ - struct seq_file *m = file->private_data; - m->private = iter; - } else { - /* Failed */ - free_ftrace_hash(iter->hash); - trace_parser_put(&iter->parser); - kfree(iter); - } - } else - file->private_data = iter; - mutex_unlock(&ftrace_regex_lock); - - return ret; -} - -static int -ftrace_filter_open(struct inode *inode, struct file *file) -{ - return ftrace_regex_open(&global_ops, - FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, - inode, file); -} - -static int -ftrace_notrace_open(struct inode *inode, struct file *file) -{ - return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, - inode, file); -} - -loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); - else - file->f_pos = ret = 1; - - return ret; -} - -static int ftrace_match(char *str, char *regex, int len, int type) -{ - int matched = 0; - int slen; - - switch (type) { - case MATCH_FULL: - if (strcmp(str, regex) == 0) - matched = 1; - break; - case MATCH_FRONT_ONLY: - if (strncmp(str, regex, len) == 0) - matched = 1; - break; - case MATCH_MIDDLE_ONLY: - if (strstr(str, regex)) - matched = 1; - break; - case MATCH_END_ONLY: - slen = strlen(str); - if (slen >= len && memcmp(str + slen - len, regex, len) == 0) - matched = 1; - break; - } - - return matched; -} - -static int -enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) -{ - struct ftrace_func_entry *entry; - int ret = 0; - - entry = ftrace_lookup_ip(hash, rec->ip); - if (not) { - /* Do nothing if it doesn't exist */ - if (!entry) - return 0; - - free_hash_entry(hash, entry); - } else { - /* Do nothing if it exists */ - if (entry) - return 0; - - ret = add_hash_entry(hash, rec->ip); - } - return ret; -} - -static int -ftrace_match_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) -{ - char str[KSYM_SYMBOL_LEN]; - char *modname; - - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - - if (mod) 
{ - /* module lookup requires matching the module */ - if (!modname || strcmp(modname, mod)) - return 0; - - /* blank search means to match all funcs in the mod */ - if (!len) - return 1; - } - - return ftrace_match(str, regex, len, type); -} - -static int -match_records(struct ftrace_hash *hash, char *buff, - int len, char *mod, int not) -{ - unsigned search_len = 0; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; - int found = 0; - int ret; - - if (len) { - type = filter_parse_regex(buff, len, &search, ¬); - search_len = strlen(search); - } - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out_unlock; - - do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { - ret = enter_record(hash, rec, not); - if (ret < 0) { - found = ret; - goto out_unlock; - } - found = 1; - } - } while_for_each_ftrace_rec(); - out_unlock: - mutex_unlock(&ftrace_lock); - - return found; -} - -static int -ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) -{ - return match_records(hash, buff, len, NULL, 0); -} - -static int -ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) -{ - int not = 0; - - /* blank or '*' mean the same */ - if (strcmp(buff, "*") == 0) - buff[0] = 0; - - /* handle the case of 'dont filter this module' */ - if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { - buff[0] = 0; - not = 1; - } - - return match_records(hash, buff, strlen(buff), mod, not); -} - -/* - * We register the module command as a template to show others how - * to register the a command as well. - */ - -static int -ftrace_mod_callback(struct ftrace_hash *hash, - char *func, char *cmd, char *param, int enable) -{ - char *mod; - int ret = -EINVAL; - - /* - * cmd == 'mod' because we only registered this func - * for the 'mod' ftrace_func_command. 
- * But if you register one func with multiple commands, - * you can tell which command was used by the cmd - * parameter. - */ - - /* we must have a module name */ - if (!param) - return ret; - - mod = strsep(¶m, ":"); - if (!strlen(mod)) - return ret; - - ret = ftrace_match_module_records(hash, func, mod); - if (!ret) - ret = -EINVAL; - if (ret < 0) - return ret; - - return 0; -} - -static struct ftrace_func_command ftrace_mod_cmd = { - .name = "mod", - .func = ftrace_mod_callback, -}; - -static int __init ftrace_mod_cmd_init(void) -{ - return register_ftrace_command(&ftrace_mod_cmd); -} -device_initcall(ftrace_mod_cmd_init); - -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) -{ - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - struct hlist_node *n; - unsigned long key; - - key = hash_long(ip, FTRACE_HASH_BITS); - - hhd = &ftrace_func_hash[key]; - - if (hlist_empty(hhd)) - return; - - /* - * Disable preemption for these calls to prevent a RCU grace - * period. This syncs the hash iteration and freeing of items - * on the hash. rcu_read_lock is too dangerous here. - */ - preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, n, hhd, node) { - if (entry->ip == ip) - entry->ops->func(ip, parent_ip, &entry->data); - } - preempt_enable_notrace(); -} - -static struct ftrace_ops trace_probe_ops __read_mostly = -{ - .func = function_trace_probe_call, -}; - -static int ftrace_probe_registered; - -static void __enable_ftrace_function_probe(void) -{ - int ret; - int i; - - if (ftrace_probe_registered) - return; - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - break; - } - /* Nothing registered? 
*/ - if (i == FTRACE_FUNC_HASHSIZE) - return; - - ret = __register_ftrace_function(&trace_probe_ops); - if (!ret) - ret = ftrace_startup(&trace_probe_ops, 0); - - ftrace_probe_registered = 1; -} - -static void __disable_ftrace_function_probe(void) -{ - int ret; - int i; - - if (!ftrace_probe_registered) - return; - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - if (hhd->first) - return; - } - - /* no more funcs left */ - ret = __unregister_ftrace_function(&trace_probe_ops); - if (!ret) - ftrace_shutdown(&trace_probe_ops, 0); - - ftrace_probe_registered = 0; -} - - -static void ftrace_free_entry_rcu(struct rcu_head *rhp) -{ - struct ftrace_func_probe *entry = - container_of(rhp, struct ftrace_func_probe, rcu); - - if (entry->ops->free) - entry->ops->free(&entry->data); - kfree(entry); -} - - -int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) -{ - struct ftrace_func_probe *entry; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int type, len, not; - unsigned long key; - int count = 0; - char *search; - - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); - - /* we do not support '!' for function probes */ - if (WARN_ON(not)) - return -EINVAL; - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out_unlock; - - do_for_each_ftrace_rec(pg, rec) { - - if (!ftrace_match_record(rec, NULL, search, len, type)) - continue; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - /* If we did not process any, then return error */ - if (!count) - count = -ENOMEM; - goto out_unlock; - } - - count++; - - entry->data = data; - - /* - * The caller might want to do something special - * for each function we find. We call the callback - * to give the caller an opportunity to do so. 
- */ - if (ops->callback) { - if (ops->callback(rec->ip, &entry->data) < 0) { - /* caller does not like this func */ - kfree(entry); - continue; - } - } - - entry->ops = ops; - entry->ip = rec->ip; - - key = hash_long(entry->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); - - } while_for_each_ftrace_rec(); - __enable_ftrace_function_probe(); - - out_unlock: - mutex_unlock(&ftrace_lock); - - return count; -} - -enum { - PROBE_TEST_FUNC = 1, - PROBE_TEST_DATA = 2 -}; - -static void -__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data, int flags) -{ - struct ftrace_func_probe *entry; - struct hlist_node *n, *tmp; - char str[KSYM_SYMBOL_LEN]; - int type = MATCH_FULL; - int i, len = 0; - char *search; - - if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) - glob = NULL; - else if (glob) { - int not; - - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); - - /* we do not support '!' for function probes */ - if (WARN_ON(not)) - return; - } - - mutex_lock(&ftrace_lock); - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - - hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { - - /* break up if statements for readability */ - if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) - continue; - - if ((flags & PROBE_TEST_DATA) && entry->data != data) - continue; - - /* do this last, since it is the most expensive */ - if (glob) { - kallsyms_lookup(entry->ip, NULL, NULL, - NULL, str); - if (!ftrace_match(str, glob, len, type)) - continue; - } - - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); - } - } - __disable_ftrace_function_probe(); - mutex_unlock(&ftrace_lock); -} - -void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) -{ - __unregister_ftrace_function_probe(glob, ops, data, - PROBE_TEST_FUNC | PROBE_TEST_DATA); -} - -void 
-unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) -{ - __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); -} - -void unregister_ftrace_function_probe_all(char *glob) -{ - __unregister_ftrace_function_probe(glob, NULL, NULL, 0); -} - -static LIST_HEAD(ftrace_commands); -static DEFINE_MUTEX(ftrace_cmd_mutex); - -int register_ftrace_command(struct ftrace_func_command *cmd) -{ - struct ftrace_func_command *p; - int ret = 0; - - mutex_lock(&ftrace_cmd_mutex); - list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } - } - list_add(&cmd->list, &ftrace_commands); - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - - return ret; -} - -int unregister_ftrace_command(struct ftrace_func_command *cmd) -{ - struct ftrace_func_command *p, *n; - int ret = -ENODEV; - - mutex_lock(&ftrace_cmd_mutex); - list_for_each_entry_safe(p, n, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = 0; - list_del_init(&p->list); - goto out_unlock; - } - } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - - return ret; -} - -static int ftrace_process_regex(struct ftrace_hash *hash, - char *buff, int len, int enable) -{ - char *func, *command, *next = buff; - struct ftrace_func_command *p; - int ret = -EINVAL; - - func = strsep(&next, ":"); - - if (!next) { - ret = ftrace_match_records(hash, func, len); - if (!ret) - ret = -EINVAL; - if (ret < 0) - return ret; - return 0; - } - - /* command found */ - - command = strsep(&next, ":"); - - mutex_lock(&ftrace_cmd_mutex); - list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->func(hash, func, command, next, enable); - goto out_unlock; - } - } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - - return ret; -} - -static ssize_t -ftrace_regex_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos, int enable) -{ - struct ftrace_iterator *iter; - 
struct trace_parser *parser; - ssize_t ret, read; - - if (!cnt) - return 0; - - mutex_lock(&ftrace_regex_lock); - - ret = -ENODEV; - if (unlikely(ftrace_disabled)) - goto out_unlock; - - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - iter = m->private; - } else - iter = file->private_data; - - parser = &iter->parser; - read = trace_get_user(parser, ubuf, cnt, ppos); - - if (read >= 0 && trace_parser_loaded(parser) && - !trace_parser_cont(parser)) { - ret = ftrace_process_regex(iter->hash, parser->buffer, - parser->idx, enable); - trace_parser_clear(parser); - if (ret) - goto out_unlock; - } - - ret = read; -out_unlock: - mutex_unlock(&ftrace_regex_lock); - - return ret; -} - -ssize_t -ftrace_filter_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return ftrace_regex_write(file, ubuf, cnt, ppos, 1); -} - -ssize_t -ftrace_notrace_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return ftrace_regex_write(file, ubuf, cnt, ppos, 0); -} - -static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable) -{ - struct ftrace_hash **orig_hash; - struct ftrace_hash *hash; - int ret; - - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - if (enable) - orig_hash = &ops->filter_hash; - else - orig_hash = &ops->notrace_hash; - - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); - if (!hash) - return -ENOMEM; - - mutex_lock(&ftrace_regex_lock); - if (reset) - ftrace_filter_reset(hash); - if (buf) - ftrace_match_records(hash, buf, len); - - mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); - - mutex_unlock(&ftrace_lock); - - mutex_unlock(&ftrace_regex_lock); - 
- free_ftrace_hash(hash); - return ret; -} - -/** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. - * - * Filters denote which functions should be enabled when tracing is enabled. - * If @buf is NULL and reset is set, all functions will be enabled for tracing. - */ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, - int len, int reset) -{ - ftrace_set_regex(ops, buf, len, reset, 1); -} -EXPORT_SYMBOL_GPL(ftrace_set_filter); - -/** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. - * - * Notrace Filters denote which functions should not be enabled when tracing - * is enabled. If @buf is NULL and reset is set, all functions will be enabled - * for tracing. - */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, - int len, int reset) -{ - ftrace_set_regex(ops, buf, len, reset, 0); -} -EXPORT_SYMBOL_GPL(ftrace_set_notrace); -/** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. - * - * Filters denote which functions should be enabled when tracing is enabled. - * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
- */ -void ftrace_set_global_filter(unsigned char *buf, int len, int reset) -{ - ftrace_set_regex(&global_ops, buf, len, reset, 1); -} -EXPORT_SYMBOL_GPL(ftrace_set_global_filter); - -/** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. - * - * Notrace Filters denote which functions should not be enabled when tracing - * is enabled. If @buf is NULL and reset is set, all functions will be enabled - * for tracing. - */ -void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) -{ - ftrace_set_regex(&global_ops, buf, len, reset, 0); -} -EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); - -/* - * command line interface to allow users to set filters on boot up. - */ -#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE -static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; -static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; - -static int __init set_ftrace_notrace(char *str) -{ - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); - return 1; -} -__setup("ftrace_notrace=", set_ftrace_notrace); - -static int __init set_ftrace_filter(char *str) -{ - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); - return 1; -} -__setup("ftrace_filter=", set_ftrace_filter); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); - -static int __init set_graph_function(char *str) -{ - strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); - return 1; -} -__setup("ftrace_graph_filter=", set_graph_function); - -static void __init set_ftrace_early_graph(char *buf) -{ - int ret; - char *func; - - while (buf) { - func = strsep(&buf, ","); - /* we allow only one expression at a time */ - ret = 
ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - func); - if (ret) - printk(KERN_DEBUG "ftrace: function %s not " - "traceable\n", func); - } -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -void __init -ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) -{ - char *func; - - while (buf) { - func = strsep(&buf, ","); - ftrace_set_regex(ops, func, strlen(func), 0, enable); - } -} - -static void __init set_ftrace_early_filters(void) -{ - if (ftrace_filter_buf[0]) - ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); - if (ftrace_notrace_buf[0]) - ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (ftrace_graph_buf[0]) - set_ftrace_early_graph(ftrace_graph_buf); -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -} - -int ftrace_regex_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter; - struct ftrace_hash **orig_hash; - struct trace_parser *parser; - int filter_hash; - int ret; - - mutex_lock(&ftrace_regex_lock); - if (file->f_mode & FMODE_READ) { - iter = m->private; - - seq_release(inode, file); - } else - iter = file->private_data; - - parser = &iter->parser; - if (trace_parser_loaded(parser)) { - parser->buffer[parser->idx] = 0; - ftrace_match_records(iter->hash, parser->buffer, parser->idx); - } - - trace_parser_put(parser); - - if (file->f_mode & FMODE_WRITE) { - filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); - - if (filter_hash) - orig_hash = &iter->ops->filter_hash; - else - orig_hash = &iter->ops->notrace_hash; - - mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(iter->ops, filter_hash, - orig_hash, iter->hash); - if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); - - mutex_unlock(&ftrace_lock); - } - free_ftrace_hash(iter->hash); - kfree(iter); - - mutex_unlock(&ftrace_regex_lock); - return 0; -} - -static 
const struct file_operations ftrace_avail_fops = { - .open = ftrace_avail_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static const struct file_operations ftrace_enabled_fops = { - .open = ftrace_enabled_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static const struct file_operations ftrace_filter_fops = { - .open = ftrace_filter_open, - .read = seq_read, - .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, - .release = ftrace_regex_release, -}; - -static const struct file_operations ftrace_notrace_fops = { - .open = ftrace_notrace_open, - .read = seq_read, - .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, - .release = ftrace_regex_release, -}; - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -static DEFINE_MUTEX(graph_lock); - -int ftrace_graph_count; -int ftrace_graph_filter_enabled; -unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; - -static void * -__g_next(struct seq_file *m, loff_t *pos) -{ - if (*pos >= ftrace_graph_count) - return NULL; - return &ftrace_graph_funcs[*pos]; -} - -static void * -g_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return __g_next(m, pos); -} - -static void *g_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&graph_lock); - - /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_filter_enabled && !*pos) - return (void *)1; - - return __g_next(m, pos); -} - -static void g_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&graph_lock); -} - -static int g_show(struct seq_file *m, void *v) -{ - unsigned long *ptr = v; - - if (!ptr) - return 0; - - if (ptr == (unsigned long *)1) { - seq_printf(m, "#### all functions enabled ####\n"); - return 0; - } - - seq_printf(m, "%ps\n", (void *)*ptr); - - return 0; -} - -static const struct seq_operations ftrace_graph_seq_ops = { - .start = g_start, - .next = g_next, - .stop = g_stop, - .show = g_show, -}; - -static 
int -ftrace_graph_open(struct inode *inode, struct file *file) -{ - int ret = 0; - - if (unlikely(ftrace_disabled)) - return -ENODEV; - - mutex_lock(&graph_lock); - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - ftrace_graph_filter_enabled = 0; - ftrace_graph_count = 0; - memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); - } - mutex_unlock(&graph_lock); - - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_graph_seq_ops); - - return ret; -} - -static int -ftrace_graph_release(struct inode *inode, struct file *file) -{ - if (file->f_mode & FMODE_READ) - seq_release(inode, file); - return 0; -} - -static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - int search_len; - int fail = 1; - int type, not; - char *search; - bool exists; - int i; - - /* decode regex */ - type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) - return -EBUSY; - - search_len = strlen(search); - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) { - mutex_unlock(&ftrace_lock); - return -ENODEV; - } - - do_for_each_ftrace_rec(pg, rec) { - - if (ftrace_match_record(rec, NULL, search, search_len, type)) { - /* if it is in the array */ - exists = false; - for (i = 0; i < *idx; i++) { - if (array[i] == rec->ip) { - exists = true; - break; - } - } - - if (!not) { - fail = 0; - if (!exists) { - array[(*idx)++] = rec->ip; - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) - goto out; - } - } else { - if (exists) { - array[i] = array[--(*idx)]; - array[*idx] = 0; - fail = 0; - } - } - } - } while_for_each_ftrace_rec(); -out: - mutex_unlock(&ftrace_lock); - - if (fail) - return -EINVAL; - - ftrace_graph_filter_enabled = 1; - return 0; -} - -static ssize_t -ftrace_graph_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_parser parser; - ssize_t read, ret; - - if (!cnt) - return 0; - - 
mutex_lock(&graph_lock); - - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { - ret = -ENOMEM; - goto out_unlock; - } - - read = trace_get_user(&parser, ubuf, cnt, ppos); - - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - parser.buffer); - if (ret) - goto out_free; - } - - ret = read; - -out_free: - trace_parser_put(&parser); -out_unlock: - mutex_unlock(&graph_lock); - - return ret; -} - -static const struct file_operations ftrace_graph_fops = { - .open = ftrace_graph_open, - .read = seq_read, - .write = ftrace_graph_write, - .release = ftrace_graph_release, - .llseek = seq_lseek, -}; -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) -{ - - trace_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - - trace_create_file("enabled_functions", 0444, - d_tracer, NULL, &ftrace_enabled_fops); - - trace_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - - trace_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0444, d_tracer, - NULL, - &ftrace_graph_fops); -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - return 0; -} - -static void ftrace_swap_recs(void *a, void *b, int size) -{ - struct dyn_ftrace *reca = a; - struct dyn_ftrace *recb = b; - struct dyn_ftrace t; - - t = *reca; - *reca = *recb; - *recb = t; -} - -static int ftrace_process_locs(struct module *mod, - unsigned long *start, - unsigned long *end) -{ - struct ftrace_page *pg; - unsigned long count; - unsigned long *p; - unsigned long addr; - unsigned long flags = 0; /* Shut up gcc */ - int ret = -ENOMEM; - - count = end - start; - - if (!count) - return 0; - - pg = ftrace_allocate_pages(count); - if (!pg) - return -ENOMEM; 
- - mutex_lock(&ftrace_lock); - - /* - * Core and each module needs their own pages, as - * modules will free them when they are removed. - * Force a new page to be allocated for modules. - */ - if (!mod) { - WARN_ON(ftrace_pages || ftrace_pages_start); - /* First initialization */ - ftrace_pages = ftrace_pages_start = pg; - } else { - if (!ftrace_pages) - goto out; - - if (WARN_ON(ftrace_pages->next)) { - /* Hmm, we have free pages? */ - while (ftrace_pages->next) - ftrace_pages = ftrace_pages->next; - } - - ftrace_pages->next = pg; - ftrace_pages = pg; - } - - p = start; - while (p < end) { - addr = ftrace_call_adjust(*p++); - /* - * Some architecture linkers will pad between - * the different mcount_loc sections of different - * object files to satisfy alignments. - * Skip any NULL pointers. - */ - if (!addr) - continue; - if (!ftrace_record_ip(addr)) - break; - } - - /* These new locations need to be initialized */ - ftrace_new_pgs = pg; - - /* Make each individual set of pages sorted by ips */ - for (; pg; pg = pg->next) - sort(pg->records, pg->index, sizeof(struct dyn_ftrace), - ftrace_cmp_recs, ftrace_swap_recs); - - /* - * We only need to disable interrupts on start up - * because we are modifying code that an interrupt - * may execute, and the modification is not atomic. - * But for modules, nothing runs the code we modify - * until we are finished with it, and there's no - * reason to cause large interrupt latencies while we do it. 
- */ - if (!mod) - local_irq_save(flags); - ftrace_update_code(mod); - if (!mod) - local_irq_restore(flags); - ret = 0; - out: - mutex_unlock(&ftrace_lock); - - return ret; -} - -#ifdef CONFIG_MODULES - -#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) - -void ftrace_release_mod(struct module *mod) -{ - struct dyn_ftrace *rec; - struct ftrace_page **last_pg; - struct ftrace_page *pg; - int order; - - mutex_lock(&ftrace_lock); - - if (ftrace_disabled) - goto out_unlock; - - /* - * Each module has its own ftrace_pages, remove - * them from the list. - */ - last_pg = &ftrace_pages_start; - for (pg = ftrace_pages_start; pg; pg = *last_pg) { - rec = &pg->records[0]; - if (within_module_core(rec->ip, mod)) { - /* - * As core pages are first, the first - * page should never be a module page. - */ - if (WARN_ON(pg == ftrace_pages_start)) - goto out_unlock; - - /* Check if we are deleting the last page */ - if (pg == ftrace_pages) - ftrace_pages = next_to_ftrace_page(last_pg); - - *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - free_pages((unsigned long)pg->records, order); - kfree(pg); - } else - last_pg = &pg->next; - } - out_unlock: - mutex_unlock(&ftrace_lock); -} - -static void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) -{ - if (ftrace_disabled || start == end) - return; - ftrace_process_locs(mod, start, end); -} - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - -struct notifier_block ftrace_module_nb = { - 
.notifier_call = ftrace_module_notify, - .priority = 0, -}; - -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; - -void __init ftrace_init(void) -{ - unsigned long count, addr, flags; - int ret; - - /* Keep the ftrace pointer to the stub */ - addr = (unsigned long)ftrace_stub; - - local_irq_save(flags); - ftrace_dyn_arch_init(&addr); - local_irq_restore(flags); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) - goto failed; - - count = __stop_mcount_loc - __start_mcount_loc; - - ret = ftrace_dyn_table_alloc(count); - if (ret) - goto failed; - - last_ftrace_enabled = ftrace_enabled = 1; - - ret = ftrace_process_locs(NULL, - __start_mcount_loc, - __stop_mcount_loc); - - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module notifier\n"); - - set_ftrace_early_filters(); - - return; - failed: - ftrace_disabled = 1; -} - -#else - -static struct ftrace_ops global_ops = { - .func = ftrace_stub, -}; - -static int __init ftrace_nodyn_init(void) -{ - ftrace_enabled = 1; - return 0; -} -device_initcall(ftrace_nodyn_init); - -static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } -static inline void ftrace_startup_enable(int command) { } -/* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - 0; \ - }) -# define ftrace_shutdown(ops, command) do { } while (0) -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) - -static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) -{ - return 1; -} - -#endif /* CONFIG_DYNAMIC_FTRACE */ - -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) -{ - struct ftrace_ops *op; - - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) - return; - - trace_recursion_set(TRACE_INTERNAL_BIT); - /* - * Some of the 
ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). - */ - preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { - if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); - op = rcu_dereference_raw(op->next); - }; - preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); -} - -static void clear_ftrace_swapper(void) -{ - struct task_struct *p; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - clear_tsk_trace_trace(p); - } - put_online_cpus(); -} - -static void set_ftrace_swapper(void) -{ - struct task_struct *p; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - set_tsk_trace_trace(p); - } - put_online_cpus(); -} - -static void clear_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; - - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - clear_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); - - put_pid(pid); -} - -static void set_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; - - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - set_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); -} - -static void clear_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - clear_ftrace_swapper(); - else - clear_ftrace_pid(pid); -} - -static void set_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - set_ftrace_swapper(); - else - set_ftrace_pid(pid); -} - -static int ftrace_pid_add(int p) -{ - struct pid *pid; - struct ftrace_pid *fpid; - int ret = -EINVAL; - - mutex_lock(&ftrace_lock); - - if (!p) - pid = ftrace_swapper_pid; - else - pid = find_get_pid(p); - - if (!pid) - goto out; - - ret = 0; - - list_for_each_entry(fpid, &ftrace_pids, list) - if (fpid->pid == pid) - goto out_put; - - ret = -ENOMEM; - - fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); - if (!fpid) - 
goto out_put; - - list_add(&fpid->list, &ftrace_pids); - fpid->pid = pid; - - set_ftrace_pid_task(pid); - - ftrace_update_pid_func(); - ftrace_startup_enable(0); - - mutex_unlock(&ftrace_lock); - return 0; - -out_put: - if (pid != ftrace_swapper_pid) - put_pid(pid); - -out: - mutex_unlock(&ftrace_lock); - return ret; -} - -static void ftrace_pid_reset(void) -{ - struct ftrace_pid *fpid, *safe; - - mutex_lock(&ftrace_lock); - list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { - struct pid *pid = fpid->pid; - - clear_ftrace_pid_task(pid); - - list_del(&fpid->list); - kfree(fpid); - } - - ftrace_update_pid_func(); - ftrace_startup_enable(0); - - mutex_unlock(&ftrace_lock); -} - -static void *fpid_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&ftrace_lock); - - if (list_empty(&ftrace_pids) && (!*pos)) - return (void *) 1; - - return seq_list_start(&ftrace_pids, *pos); -} - -static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) -{ - if (v == (void *)1) - return NULL; - - return seq_list_next(v, &ftrace_pids, pos); -} - -static void fpid_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&ftrace_lock); -} - -static int fpid_show(struct seq_file *m, void *v) -{ - const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - - if (v == (void *)1) { - seq_printf(m, "no pid\n"); - return 0; - } - - if (fpid->pid == ftrace_swapper_pid) - seq_printf(m, "swapper tasks\n"); - else - seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - - return 0; -} - -static const struct seq_operations ftrace_pid_sops = { - .start = fpid_start, - .next = fpid_next, - .stop = fpid_stop, - .show = fpid_show, -}; - -static int -ftrace_pid_open(struct inode *inode, struct file *file) -{ - int ret = 0; - - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_pid_reset(); - - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_pid_sops); - - return ret; -} - -static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - 
size_t cnt, loff_t *ppos) -{ - char buf[64], *tmp; - long val; - int ret; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* - * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" - * to clean the filter quietly. - */ - tmp = strstrip(buf); - if (strlen(tmp) == 0) - return 1; - - ret = strict_strtol(tmp, 10, &val); - if (ret < 0) - return ret; - - ret = ftrace_pid_add(val); - - return ret ? ret : cnt; -} - -static int -ftrace_pid_release(struct inode *inode, struct file *file) -{ - if (file->f_mode & FMODE_READ) - seq_release(inode, file); - - return 0; -} - -static const struct file_operations ftrace_pid_fops = { - .open = ftrace_pid_open, - .write = ftrace_pid_write, - .read = seq_read, - .llseek = seq_lseek, - .release = ftrace_pid_release, -}; - -static __init int ftrace_init_debugfs(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return 0; - - ftrace_init_dyn_debugfs(d_tracer); - - trace_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - - ftrace_profile_debugfs(d_tracer); - - return 0; -} -fs_initcall(ftrace_init_debugfs); - -/** - * ftrace_kill - kill ftrace - * - * This function should be used by panic code. It stops ftrace - * but in a not so nice way. If you need to simply kill ftrace - * from a non-atomic section, use ftrace_kill. - */ -void ftrace_kill(void) -{ - ftrace_disabled = 1; - ftrace_enabled = 0; - clear_ftrace_function(); -} - -/** - * Test if ftrace is dead or not. - */ -int ftrace_is_dead(void) -{ - return ftrace_disabled; -} - -/** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. - * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. 
- */ -int register_ftrace_function(struct ftrace_ops *ops) -{ - int ret = -1; - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out_unlock; - - ret = __register_ftrace_function(ops); - if (!ret) - ret = ftrace_startup(ops, 0); - - - out_unlock: - mutex_unlock(&ftrace_lock); - return ret; -} -EXPORT_SYMBOL_GPL(register_ftrace_function); - -/** - * unregister_ftrace_function - unregister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) -{ - int ret; - - mutex_lock(&ftrace_lock); - ret = __unregister_ftrace_function(ops); - if (!ret) - ftrace_shutdown(ops, 0); - mutex_unlock(&ftrace_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(unregister_ftrace_function); - -int -ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = -ENODEV; - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) - goto out; - - last_ftrace_enabled = !!ftrace_enabled; - - if (ftrace_enabled) { - - ftrace_startup_sysctl(); - - /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) { - if (ftrace_ops_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_ops_list->func; - else - ftrace_trace_function = ftrace_ops_list_func; - } - - } else { - /* stopping ftrace calls (just send to ftrace_stub) */ - ftrace_trace_function = ftrace_stub; - - ftrace_shutdown_sysctl(); - } - - out: - mutex_unlock(&ftrace_lock); - return ret; -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier; - -int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) -{ - return 0; -} - -/* The callbacks that hook 
a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; -trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; - -/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ -static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) -{ - int i; - int ret = 0; - unsigned long flags; - int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; - struct task_struct *g, *t; - - for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack_list[i]) { - start = 0; - end = i; - ret = -ENOMEM; - goto free; - } - } - - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { - if (start == end) { - ret = -EAGAIN; - goto unlock; - } - - if (t->ret_stack == NULL) { - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->curr_ret_stack = -1; - /* Make sure the tasks see the -1 first: */ - smp_wmb(); - t->ret_stack = ret_stack_list[start++]; - } - } while_each_thread(g, t); - -unlock: - read_unlock_irqrestore(&tasklist_lock, flags); -free: - for (i = start; i < end; i++) - kfree(ret_stack_list[i]); - return ret; -} - -static void -ftrace_graph_probe_sched_switch(void *ignore, - struct task_struct *prev, struct task_struct *next) -{ - unsigned long long timestamp; - int index; - - /* - * Does the user want to count the time a function was asleep. - * If so, do not update the time stamps. - */ - if (trace_flags & TRACE_ITER_SLEEP_TIME) - return; - - timestamp = trace_clock_local(); - - prev->ftrace_timestamp = timestamp; - - /* only process tasks that we timestamped */ - if (!next->ftrace_timestamp) - return; - - /* - * Update all the counters in next to make up for the - * time next was sleeping. 
- */ - timestamp -= next->ftrace_timestamp; - - for (index = next->curr_ret_stack; index >= 0; index--) - next->ret_stack[index].calltime += timestamp; -} - -/* Allocate a return stack for each task */ -static int start_graph_tracing(void) -{ - struct ftrace_ret_stack **ret_stack_list; - int ret, cpu; - - ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * - sizeof(struct ftrace_ret_stack *), - GFP_KERNEL); - - if (!ret_stack_list) - return -ENOMEM; - - /* The cpu_boot init_task->ret_stack will never be freed */ - for_each_online_cpu(cpu) { - if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_idle_task(idle_task(cpu), cpu); - } - - do { - ret = alloc_retstack_tasklist(ret_stack_list); - } while (ret == -EAGAIN); - - if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - if (ret) - pr_info("ftrace_graph: Couldn't activate tracepoint" - " probe to kernel_sched_switch\n"); - } - - kfree(ret_stack_list); - return ret; -} - -/* - * Hibernation protection. - * The state of the current task is too much unstable during - * suspend/restore to disk. We want to protect against that. 
- */ -static int -ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, - void *unused) -{ - switch (state) { - case PM_HIBERNATION_PREPARE: - pause_graph_tracing(); - break; - - case PM_POST_HIBERNATION: - unpause_graph_tracing(); - break; - } - return NOTIFY_DONE; -} - -int register_ftrace_graph(trace_func_graph_ret_t retfunc, - trace_func_graph_ent_t entryfunc) -{ - int ret = 0; - - mutex_lock(&ftrace_lock); - - /* we currently allow only one tracer registered at a time */ - if (ftrace_graph_active) { - ret = -EBUSY; - goto out; - } - - ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; - register_pm_notifier(&ftrace_suspend_notifier); - - ftrace_graph_active++; - ret = start_graph_tracing(); - if (ret) { - ftrace_graph_active--; - goto out; - } - - ftrace_graph_return = retfunc; - ftrace_graph_entry = entryfunc; - - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); - -out: - mutex_unlock(&ftrace_lock); - return ret; -} - -void unregister_ftrace_graph(void) -{ - mutex_lock(&ftrace_lock); - - if (unlikely(!ftrace_graph_active)) - goto out; - - ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; - ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); - unregister_pm_notifier(&ftrace_suspend_notifier); - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - - out: - mutex_unlock(&ftrace_lock); -} - -static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); - -static void -graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) -{ - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visible before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; -} - -/* - * Allocate a return stack for the idle task. May be the first - * time through, or it may be done by CPU hotplug online. 
- */ -void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) -{ - t->curr_ret_stack = -1; - /* - * The idle task has no parent, it either has its own - * stack or no stack at all. - */ - if (t->ret_stack) - WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); - - if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; - - ret_stack = per_cpu(idle_ret_stack, cpu); - if (!ret_stack) { - ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack) - return; - per_cpu(idle_ret_stack, cpu) = ret_stack; - } - graph_init_task(t, ret_stack); - } -} - -/* Allocate a return stack for newly created task */ -void ftrace_graph_init_task(struct task_struct *t) -{ - /* Make sure we do not use the parent ret_stack */ - t->ret_stack = NULL; - t->curr_ret_stack = -1; - - if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; - - ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); - if (!ret_stack) - return; - graph_init_task(t, ret_stack); - } -} - -void ftrace_graph_exit_task(struct task_struct *t) -{ - struct ftrace_ret_stack *ret_stack = t->ret_stack; - - t->ret_stack = NULL; - /* NULL must become visible to IRQs before we free it: */ - barrier(); - - kfree(ret_stack); -} - -void ftrace_graph_stop(void) -{ - ftrace_stop(); -} -#endif -/* - * Power trace points - * - * Copyright (C) 2009 Arjan van de Ven - */ - -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#ifdef EVENT_POWER_TRACING_DEPRECATED -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -#endif -EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); - -/* - * Generic ring buffer - * - * Copyright (C) 2008 Steven Rostedt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "trace.h" - -/* - * The ring buffer header is special. 
We must manually up keep it. - */ -int ring_buffer_print_entry_header(struct trace_seq *s) -{ - int ret; - - ret = trace_seq_printf(s, "# compressed entry header\n"); - ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_printf(s, "\tarray : 32 bits\n"); - ret = trace_seq_printf(s, "\n"); - ret = trace_seq_printf(s, "\tpadding : type == %d\n", - RINGBUF_TYPE_PADDING); - ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", - RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata max type_len == %d\n", - RINGBUF_TYPE_DATA_TYPE_LEN_MAX); - - return ret; -} - -/* - * The ring buffer is made up of a list of pages. A separate list of pages is - * allocated for each CPU. A writer may only write to a buffer that is - * associated with the CPU it is currently executing on. A reader may read - * from any per cpu buffer. - * - * The reader is special. For each per cpu buffer, the reader has its own - * reader page. When a reader has read the entire reader page, this reader - * page is swapped with another page in the ring buffer. - * - * Now, as long as the writer is off the reader page, the reader can do what - * ever it wants with that page. The writer will never write to that page - * again (as long as it is out of the ring buffer). - * - * Here's some silly ASCII art. 
- * - * +------+ - * |reader| RING BUFFER - * |page | - * +------+ +---+ +---+ +---+ - * | |-->| |-->| | - * +---+ +---+ +---+ - * ^ | - * | | - * +---------------+ - * - * - * +------+ - * |reader| RING BUFFER - * |page |------------------v - * +------+ +---+ +---+ +---+ - * | |-->| |-->| | - * +---+ +---+ +---+ - * ^ | - * | | - * +---------------+ - * - * - * +------+ - * |reader| RING BUFFER - * |page |------------------v - * +------+ +---+ +---+ +---+ - * ^ | |-->| |-->| | - * | +---+ +---+ +---+ - * | | - * | | - * +------------------------------+ - * - * - * +------+ - * |buffer| RING BUFFER - * |page |------------------v - * +------+ +---+ +---+ +---+ - * ^ | | | |-->| | - * | New +---+ +---+ +---+ - * | Reader------^ | - * | page | - * +------------------------------+ - * - * - * After we make this swap, the reader can hand this page off to the splice - * code and be done with it. It can even allocate a new page if it needs to - * and swap that into the ring buffer. - * - * We will be using cmpxchg soon to make all this lockless. - * - */ - -/* - * A fast way to enable or disable all ring buffers is to - * call tracing_on or tracing_off. Turning off the ring buffers - * prevents all ring buffers from being recorded to. - * Turning this switch on, makes it OK to write to the - * ring buffer, if the ring buffer is enabled itself. - * - * There's three layers that must be on in order to write - * to the ring buffer. - * - * 1) This global flag must be set. - * 2) The ring buffer must be enabled for recording. - * 3) The per cpu buffer must be enabled for recording. - * - * In case of an anomaly, this global flag has a bit set that - * will permantly disable all ring buffers. 
- */ - -/* - * Global flag to disable all recording to ring buffers - * This has two bits: ON, DISABLED - * - * ON DISABLED - * ---- ---------- - * 0 0 : ring buffers are off - * 1 0 : ring buffers are on - * X 1 : ring buffers are permanently disabled - */ - -enum { - RB_BUFFERS_ON_BIT = 0, - RB_BUFFERS_DISABLED_BIT = 1, -}; - -enum { - RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, - RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, -}; - -static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; - -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -/** - * tracing_on - enable all tracing buffers - * - * This function enables all tracing buffers that may have been - * disabled with tracing_off. - */ -void tracing_on(void) -{ - set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_on); - -/** - * tracing_off - turn off all tracing buffers - * - * This function stops all tracing buffers from recording data. - * It does not disable any overhead the tracers themselves may - * be causing. This function simply causes all recording to - * the ring buffers to fail. - */ -void tracing_off(void) -{ - clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_off); - -/** - * tracing_off_permanent - permanently disable ring buffers - * - * This function, once called, will disable all ring buffers - * permanently. 
- */ -void tracing_off_permanent(void) -{ - set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); -} - -/** - * tracing_is_on - show state of ring buffers enabled - */ -int tracing_is_on(void) -{ - return ring_buffer_flags == RB_BUFFERS_ON; -} -EXPORT_SYMBOL_GPL(tracing_is_on); - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ -#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX - -enum { - RB_LEN_TIME_EXTEND = 8, - RB_LEN_TIME_STAMP = 16, -}; - -#define skip_time_extend(event) \ - ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) - -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; -} - -static void rb_event_set_padding(struct ring_buffer_event *event) -{ - /* padding has a NULL time_delta */ - event->type_len = RINGBUF_TYPE_PADDING; - event->time_delta = 0; -} - -static unsigned -rb_event_data_length(struct ring_buffer_event *event) -{ - unsigned length; - - if (event->type_len) - length = event->type_len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; -} - -/* - * Return the length of the given event. Will return - * the length of the time extend if the event is a - * time extend. 
- */ -static inline unsigned -rb_event_length(struct ring_buffer_event *event) -{ - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - if (rb_null_event(event)) - /* undefined */ - return -1; - return event->array[0] + RB_EVNT_HDR_SIZE; - - case RINGBUF_TYPE_TIME_EXTEND: - return RB_LEN_TIME_EXTEND; - - case RINGBUF_TYPE_TIME_STAMP: - return RB_LEN_TIME_STAMP; - - case RINGBUF_TYPE_DATA: - return rb_event_data_length(event); - default: - BUG(); - } - /* not hit */ - return 0; -} - -/* - * Return total length of time extend and data, - * or just the event length for all other events. - */ -static inline unsigned -rb_event_ts_length(struct ring_buffer_event *event) -{ - unsigned len = 0; - - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - /* time extends include the data event after it */ - len = RB_LEN_TIME_EXTEND; - event = skip_time_extend(event); - } - return len + rb_event_length(event); -} - -/** - * ring_buffer_event_length - return the length of the event - * @event: the event to get the length of - * - * Returns the size of the data load of a data event. - * If the event is something other than a data event, it - * returns the size of the event itself. With the exception - * of a TIME EXTEND, where it still returns the size of the - * data load of the data event after it. 
- */ -unsigned ring_buffer_event_length(struct ring_buffer_event *event) -{ - unsigned length; - - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - length = rb_event_length(event); - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - return length; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) - length -= sizeof(event->array[0]); - return length; -} -EXPORT_SYMBOL_GPL(ring_buffer_event_length); - -/* inline for ring buffer fast paths */ -static void * -rb_event_data(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); - /* If length is in len field, then array[0] has the data */ - if (event->type_len) - return (void *)&event->array[0]; - /* Otherwise length is in array[0] and array[1] has the data */ - return (void *)&event->array[1]; -} - -/** - * ring_buffer_event_data - return the data of the event - * @event: the event to get the data from - */ -void *ring_buffer_event_data(struct ring_buffer_event *event) -{ - return rb_event_data(event); -} -EXPORT_SYMBOL_GPL(ring_buffer_event_data); - -#define for_each_buffer_cpu(buffer, cpu) \ - for_each_cpu(cpu, buffer->cpumask) - -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - -/* Flag when events were overwritten */ -#define RB_MISSED_EVENTS (1 << 31) -/* Missed count stored at end */ -#define RB_MISSED_STORED (1 << 30) - -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[]; /* data of buffer page */ -}; - -/* - * Note, the buffer_page list must be first. The buffer pages - * are allocated in cache lines, which means that each buffer - * page will be at the beginning of a cache line, and thus - * the least significant bits will be zero. 
We use this to - * add flags in the list struct pointers, to make the ring buffer - * lockless. - */ -struct buffer_page { - struct list_head list; /* list of buffer pages */ - local_t write; /* index for next write */ - unsigned read; /* index for next read */ - local_t entries; /* entries on this page */ - unsigned long real_end; /* real end of data */ - struct buffer_data_page *page; /* Actual data page */ -}; - -/* - * The buffer page counters, write and entries, must be reset - * atomically when crossing page boundaries. To synchronize this - * update, two counters are inserted into the number. One is - * the actual counter for the write position or count on the page. - * - * The other is a counter of updaters. Before an update happens - * the update partition of the counter is incremented. This will - * allow the updater to update the counter atomically. - * - * The counter is 20 bits, and the state data is 12. - */ -#define RB_WRITE_MASK 0xfffff -#define RB_WRITE_INTCNT (1 << 20) - -static void rb_init_page(struct buffer_data_page *bpage) -{ - local_set(&bpage->commit, 0); -} - -/** - * ring_buffer_page_len - the size of data on the page. - * @page: The page to read - * - * Returns the amount of data on the page, including buffer page header. - */ -size_t ring_buffer_page_len(void *page) -{ - return local_read(&((struct buffer_data_page *)page)->commit) - + BUF_PAGE_HDR_SIZE; -} - -/* - * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing - * this issue out. - */ -static void free_buffer_page(struct buffer_page *bpage) -{ - free_page((unsigned long)bpage->page); - kfree(bpage); -} - -/* - * We need to fit the time_stamp delta into 27 bits. 
- */ -static inline int test_time_stamp(u64 delta) -{ - if (delta & TS_DELTA_TEST) - return 1; - return 0; -} - -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) - -int ring_buffer_print_page_header(struct trace_seq *s) -{ - struct buffer_data_page field; - int ret; - - ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - - return ret; -} - -/* - * head_page == tail_page && head == tail then buffer is empty. 
- */ -struct ring_buffer_per_cpu { - int cpu; - atomic_t record_disabled; - struct ring_buffer *buffer; - raw_spinlock_t reader_lock; /* serialize readers */ - arch_spinlock_t lock; - struct lock_class_key lock_key; - struct list_head *pages; - struct buffer_page *head_page; /* read from head */ - struct buffer_page *tail_page; /* write to tail */ - struct buffer_page *commit_page; /* committed pages */ - struct buffer_page *reader_page; - unsigned long lost_events; - unsigned long last_overrun; - local_t entries_bytes; - local_t commit_overrun; - local_t overrun; - local_t entries; - local_t committing; - local_t commits; - unsigned long read; - unsigned long read_bytes; - u64 write_stamp; - u64 read_stamp; -}; - -struct ring_buffer { - unsigned pages; - unsigned flags; - int cpus; - atomic_t record_disabled; - cpumask_var_t cpumask; - - struct lock_class_key *reader_lock_key; - - struct mutex mutex; - - struct ring_buffer_per_cpu **buffers; - -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif - u64 (*clock)(void); -}; - -struct ring_buffer_iter { - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long head; - struct buffer_page *head_page; - struct buffer_page *cache_reader_page; - unsigned long cache_read; - u64 read_stamp; -}; - -/* buffer may be either ring_buffer or ring_buffer_per_cpu */ -#define RB_WARN_ON(b, cond) \ - ({ \ - int _____ret = unlikely(cond); \ - if (_____ret) { \ - if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ - struct ring_buffer_per_cpu *__b = \ - (void *)b; \ - atomic_inc(&__b->buffer->record_disabled); \ - } else \ - atomic_inc(&b->record_disabled); \ - WARN_ON(1); \ - } \ - _____ret; \ - }) - -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 - -static inline u64 rb_time_stamp(struct ring_buffer *buffer) -{ - /* shift to debug/test normalization and TIME_EXTENTS */ - return buffer->clock() << DEBUG_SHIFT; -} - -u64 ring_buffer_time_stamp(struct ring_buffer *buffer, 
int cpu) -{ - u64 time; - - preempt_disable_notrace(); - time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); - - return time; -} -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); - -void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, - int cpu, u64 *ts) -{ - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; -} -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); - -/* - * Making the ring buffer lockless makes things tricky. - * Although writes only happen on the CPU that they are on, - * and they only need to worry about interrupts. Reads can - * happen on any CPU. - * - * The reader page is always off the ring buffer, but when the - * reader finishes with a page, it needs to swap its page with - * a new one from the buffer. The reader needs to take from - * the head (writes go to the tail). But if a writer is in overwrite - * mode and wraps, it must push the head page forward. - * - * Here lies the problem. - * - * The reader must be careful to replace only the head page, and - * not another one. As described at the top of the file in the - * ASCII art, the reader sets its old page to point to the next - * page after head. It then sets the page after head to point to - * the old reader page. But if the writer moves the head page - * during this operation, the reader could end up with the tail. - * - * We use cmpxchg to help prevent this race. We also do something - * special with the page before head. We set the LSB to 1. - * - * When the writer must push the page forward, it will clear the - * bit that points to the head page, move the head, and then set - * the bit that points to the new head page. - * - * We also don't want an interrupt coming in and moving the head - * page on another writer. Thus we use the second LSB to catch - * that too. 
Thus: - * - * head->list->prev->next bit 1 bit 0 - * ------- ------- - * Normal page 0 0 - * Points to head page 0 1 - * New head page 1 0 - * - * Note we can not trust the prev pointer of the head page, because: - * - * +----+ +-----+ +-----+ - * | |------>| T |---X--->| N | - * | |<------| | | | - * +----+ +-----+ +-----+ - * ^ ^ | - * | +-----+ | | - * +----------| R |----------+ | - * | |<-----------+ - * +-----+ - * - * Key: ---X--> HEAD flag set in pointer - * T Tail page - * R Reader page - * N Next page - * - * (see __rb_reserve_next() to see where this happens) - * - * What the above shows is that the reader just swapped out - * the reader page with a page in the buffer, but before it - * could make the new header point back to the new page added - * it was preempted by a writer. The writer moved forward onto - * the new page added by the reader and is about to move forward - * again. - * - * You can see, it is legitimate for the previous pointer of - * the head (or any page) not to point back to itself. But only - * temporarially. - */ - -#define RB_PAGE_NORMAL 0UL -#define RB_PAGE_HEAD 1UL -#define RB_PAGE_UPDATE 2UL - - -#define RB_FLAG_MASK 3UL - -/* PAGE_MOVED is not part of the mask */ -#define RB_PAGE_MOVED 4UL - -/* - * rb_list_head - remove any bit - */ -static struct list_head *rb_list_head(struct list_head *list) -{ - unsigned long val = (unsigned long)list; - - return (struct list_head *)(val & ~RB_FLAG_MASK); -} - -/* - * rb_is_head_page - test if the given page is the head page - * - * Because the reader may move the head_page pointer, we can - * not trust what the head page is (it may be pointing to - * the reader page). But if the next page is a header page, - * its flags will be non zero. 
- */ -static inline int -rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *page, struct list_head *list) -{ - unsigned long val; - - val = (unsigned long)list->next; - - if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) - return RB_PAGE_MOVED; - - return val & RB_FLAG_MASK; -} - -/* - * rb_is_reader_page - * - * The unique thing about the reader page, is that, if the - * writer is ever on it, the previous pointer never points - * back to the reader page. - */ -static int rb_is_reader_page(struct buffer_page *page) -{ - struct list_head *list = page->list.prev; - - return rb_list_head(list->next) != &page->list; -} - -/* - * rb_set_list_to_head - set a list_head to be pointing to head. - */ -static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *list) -{ - unsigned long *ptr; - - ptr = (unsigned long *)&list->next; - *ptr |= RB_PAGE_HEAD; - *ptr &= ~RB_PAGE_UPDATE; -} - -/* - * rb_head_page_activate - sets up head page - */ -static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct buffer_page *head; - - head = cpu_buffer->head_page; - if (!head) - return; - - /* - * Set the previous list pointer to have the HEAD flag. - */ - rb_set_list_to_head(cpu_buffer, head->list.prev); -} - -static void rb_list_head_clear(struct list_head *list) -{ - unsigned long *ptr = (unsigned long *)&list->next; - - *ptr &= ~RB_FLAG_MASK; -} - -/* - * rb_head_page_dactivate - clears head page ptr (for free list) - */ -static void -rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct list_head *hd; - - /* Go through the whole list and clear any pointers found. 
*/ - rb_list_head_clear(cpu_buffer->pages); - - list_for_each(hd, cpu_buffer->pages) - rb_list_head_clear(hd); -} - -static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *head, - struct buffer_page *prev, - int old_flag, int new_flag) -{ - struct list_head *list; - unsigned long val = (unsigned long)&head->list; - unsigned long ret; - - list = &prev->list; - - val &= ~RB_FLAG_MASK; - - ret = cmpxchg((unsigned long *)&list->next, - val | old_flag, val | new_flag); - - /* check if the reader took the page */ - if ((ret & ~RB_FLAG_MASK) != val) - return RB_PAGE_MOVED; - - return ret & RB_FLAG_MASK; -} - -static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *head, - struct buffer_page *prev, - int old_flag) -{ - return rb_head_page_set(cpu_buffer, head, prev, - old_flag, RB_PAGE_UPDATE); -} - -static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *head, - struct buffer_page *prev, - int old_flag) -{ - return rb_head_page_set(cpu_buffer, head, prev, - old_flag, RB_PAGE_HEAD); -} - -static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *head, - struct buffer_page *prev, - int old_flag) -{ - return rb_head_page_set(cpu_buffer, head, prev, - old_flag, RB_PAGE_NORMAL); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = rb_list_head((*bpage)->list.next); - - *bpage = list_entry(p, struct buffer_page, list); -} - -static struct buffer_page * -rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct buffer_page *head; - struct buffer_page *page; - struct list_head *list; - int i; - - if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) - return NULL; - - /* sanity check */ - list = cpu_buffer->pages; - if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) - return NULL; - - page = head = cpu_buffer->head_page; - 
/* - * It is possible that the writer moves the header behind - * where we started, and we miss in one loop. - * A second loop should grab the header, but we'll do - * three loops just because I'm paranoid. - */ - for (i = 0; i < 3; i++) { - do { - if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { - cpu_buffer->head_page = page; - return page; - } - rb_inc_page(cpu_buffer, &page); - } while (page != head); - } - - RB_WARN_ON(cpu_buffer, 1); - - return NULL; -} - -static int rb_head_page_replace(struct buffer_page *old, - struct buffer_page *new) -{ - unsigned long *ptr = (unsigned long *)&old->list.prev->next; - unsigned long val; - unsigned long ret; - - val = *ptr & ~RB_FLAG_MASK; - val |= RB_PAGE_HEAD; - - ret = cmpxchg(ptr, val, (unsigned long)&new->list); - - return ret == val; -} - -/* - * rb_tail_page_update - move the tail page forward - * - * Returns 1 if moved tail page, 0 if someone else did. - */ -static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - struct buffer_page *next_page) -{ - struct buffer_page *old_tail; - unsigned long old_entries; - unsigned long old_write; - int ret = 0; - - /* - * The tail page now needs to be moved forward. - * - * We need to reset the tail page, but without messing - * with possible erasing of data brought in by interrupts - * that have moved the tail page and are currently on it. - * - * We add a counter to the write field to denote this. - */ - old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); - old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - - /* - * Just make sure we have seen our old_write and synchronize - * with any interrupts that come in. - */ - barrier(); - - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. 
- */ - if (tail_page == cpu_buffer->tail_page) { - /* Zero the write counter */ - unsigned long val = old_write & ~RB_WRITE_MASK; - unsigned long eval = old_entries & ~RB_WRITE_MASK; - - /* - * This will only succeed if an interrupt did - * not come in and change it. In which case, we - * do not want to modify it. - * - * We add (void) to let the compiler know that we do not care - * about the return value of these functions. We use the - * cmpxchg to only update if an interrupt did not already - * do it for us. If the cmpxchg fails, we don't care. - */ - (void)local_cmpxchg(&next_page->write, old_write, val); - (void)local_cmpxchg(&next_page->entries, old_entries, eval); - - /* - * No need to worry about races with clearing out the commit. - * it only can increment when a commit takes place. But that - * only happens in the outer most nested commit. - */ - local_set(&next_page->page->commit, 0); - - old_tail = cmpxchg(&cpu_buffer->tail_page, - tail_page, next_page); - - if (old_tail == tail_page) - ret = 1; - } - - return ret; -} - -static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *bpage) -{ - unsigned long val = (unsigned long)bpage; - - if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) - return 1; - - return 0; -} - -/** - * rb_check_list - make sure a pointer to a list has the last bits zero - */ -static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *list) -{ - if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) - return 1; - if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) - return 1; - return 0; -} - -/** - * check_pages - integrity check of buffer pages - * @cpu_buffer: CPU buffer with pages to test - * - * As a safety measure we check to make sure the data pages have not - * been corrupted. 
- */ -static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct list_head *head = cpu_buffer->pages; - struct buffer_page *bpage, *tmp; - - rb_head_page_deactivate(cpu_buffer); - - if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) - return -1; - if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) - return -1; - - if (rb_check_list(cpu_buffer, head)) - return -1; - - list_for_each_entry_safe(bpage, tmp, head, list) { - if (RB_WARN_ON(cpu_buffer, - bpage->list.next->prev != &bpage->list)) - return -1; - if (RB_WARN_ON(cpu_buffer, - bpage->list.prev->next != &bpage->list)) - return -1; - if (rb_check_list(cpu_buffer, &bpage->list)) - return -1; - } - - rb_head_page_activate(cpu_buffer); - - return 0; -} - -static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) -{ - struct buffer_page *bpage, *tmp; - LIST_HEAD(pages); - unsigned i; - - WARN_ON(!nr_pages); - - for (i = 0; i < nr_pages; i++) { - struct page *page; - /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. - */ - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu_buffer->cpu)); - if (!bpage) - goto free_pages; - - rb_check_bpage(cpu_buffer, bpage); - - list_add(&bpage->list, &pages); - - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); - } - - /* - * The ring buffer page list is a circular list that does not - * start and end with a list head. All page list items point to - * other pages. 
- */ - cpu_buffer->pages = pages.next; - list_del(&pages); - - rb_check_pages(cpu_buffer); - - return 0; - - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - return -ENOMEM; -} - -static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *bpage; - struct page *page; - int ret; - - cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); - if (!cpu_buffer) - return NULL; - - cpu_buffer->cpu = cpu; - cpu_buffer->buffer = buffer; - raw_spin_lock_init(&cpu_buffer->reader_lock); - lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); - cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); - if (!bpage) - goto fail_free_buffer; - - rb_check_bpage(cpu_buffer, bpage); - - cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); - if (!page) - goto fail_free_reader; - bpage->page = page_address(page); - rb_init_page(bpage->page); - - INIT_LIST_HEAD(&cpu_buffer->reader_page->list); - - ret = rb_allocate_pages(cpu_buffer, buffer->pages); - if (ret < 0) - goto fail_free_reader; - - cpu_buffer->head_page - = list_entry(cpu_buffer->pages, struct buffer_page, list); - cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; - - rb_head_page_activate(cpu_buffer); - - return cpu_buffer; - - fail_free_reader: - free_buffer_page(cpu_buffer->reader_page); - - fail_free_buffer: - kfree(cpu_buffer); - return NULL; -} - -static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct list_head *head = cpu_buffer->pages; - struct buffer_page *bpage, *tmp; - - free_buffer_page(cpu_buffer->reader_page); - - rb_head_page_deactivate(cpu_buffer); - - if (head) { - 
list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - bpage = list_entry(head, struct buffer_page, list); - free_buffer_page(bpage); - } - - kfree(cpu_buffer); -} - -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); -#endif - -/** - * ring_buffer_alloc - allocate a new ring_buffer - * @size: the size in bytes per cpu that is needed. - * @flags: attributes to set for the ring buffer. - * - * Currently the only flag that is available is the RB_FL_OVERWRITE - * flag. This flag means that the buffer will overwrite old data - * when the buffer wraps. If this flag is not set, the buffer will - * drop data when the tail hits the head. - */ -struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, - struct lock_class_key *key) -{ - struct ring_buffer *buffer; - int bsize; - int cpu; - - /* keep it in its own cache line */ - buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), - GFP_KERNEL); - if (!buffer) - return NULL; - - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; - - buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - buffer->flags = flags; - buffer->clock = trace_clock_local; - buffer->reader_lock_key = key; - - /* need at least two pages */ - if (buffer->pages < 2) - buffer->pages = 2; - - /* - * In case of non-hotplug cpu, if the ring-buffer is allocated - * in early initcall, it will not be notified of secondary cpus. - * In that off case, we need to allocate for all possible cpus. 
- */ -#ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); - cpumask_copy(buffer->cpumask, cpu_online_mask); -#else - cpumask_copy(buffer->cpumask, cpu_possible_mask); -#endif - buffer->cpus = nr_cpu_ids; - - bsize = sizeof(void *) * nr_cpu_ids; - buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), - GFP_KERNEL); - if (!buffer->buffers) - goto fail_free_cpumask; - - for_each_buffer_cpu(buffer, cpu) { - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); - if (!buffer->buffers[cpu]) - goto fail_free_buffers; - } - -#ifdef CONFIG_HOTPLUG_CPU - buffer->cpu_notify.notifier_call = rb_cpu_notify; - buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); -#endif - - put_online_cpus(); - mutex_init(&buffer->mutex); - - return buffer; - - fail_free_buffers: - for_each_buffer_cpu(buffer, cpu) { - if (buffer->buffers[cpu]) - rb_free_cpu_buffer(buffer->buffers[cpu]); - } - kfree(buffer->buffers); - - fail_free_cpumask: - free_cpumask_var(buffer->cpumask); - put_online_cpus(); - - fail_free_buffer: - kfree(buffer); - return NULL; -} -EXPORT_SYMBOL_GPL(__ring_buffer_alloc); - -/** - * ring_buffer_free - free a ring buffer. - * @buffer: the buffer to free. 
- */ -void -ring_buffer_free(struct ring_buffer *buffer) -{ - int cpu; - - get_online_cpus(); - -#ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&buffer->cpu_notify); -#endif - - for_each_buffer_cpu(buffer, cpu) - rb_free_cpu_buffer(buffer->buffers[cpu]); - - put_online_cpus(); - - kfree(buffer->buffers); - free_cpumask_var(buffer->cpumask); - - kfree(buffer); -} -EXPORT_SYMBOL_GPL(ring_buffer_free); - -void ring_buffer_set_clock(struct ring_buffer *buffer, - u64 (*clock)(void)) -{ - buffer->clock = clock; -} - -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - -static void -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) -{ - struct buffer_page *bpage; - struct list_head *p; - unsigned i; - - raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); - - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - p = cpu_buffer->pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); - -out: - raw_spin_unlock_irq(&cpu_buffer->reader_lock); -} - -static void -rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *pages, unsigned nr_pages) -{ - struct buffer_page *bpage; - struct list_head *p; - unsigned i; - - raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); - - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - goto out; - p = pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - list_add_tail(&bpage->list, cpu_buffer->pages); - } - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); - -out: - raw_spin_unlock_irq(&cpu_buffer->reader_lock); -} - -/** - * ring_buffer_resize - resize the ring buffer - * @buffer: 
the buffer to resize. - * @size: the new size. - * - * Minimum size is 2 * BUF_PAGE_SIZE. - * - * Returns -1 on failure. - */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *bpage, *tmp; - unsigned long buffer_size; - LIST_HEAD(pages); - int i, cpu; - - /* - * Always succeed at resizing a non-existent buffer: - */ - if (!buffer) - return size; - - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; - buffer_size = buffer->pages * BUF_PAGE_SIZE; - - /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; - - if (size == buffer_size) - return size; - - atomic_inc(&buffer->record_disabled); - - /* Make sure all writers are done with this buffer. */ - synchronize_sched(); - - mutex_lock(&buffer->mutex); - get_online_cpus(); - - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - - if (size < buffer_size) { - - /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) - goto out_fail; - - rm_pages = buffer->pages - nr_pages; - - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - rb_remove_pages(cpu_buffer, rm_pages); - } - goto out; - } - - /* - * This is a bit more difficult. We only want to add pages - * when we can allocate enough for all CPUs. We do this - * by allocating all the pages and storing them on a local - * link list. If we succeed in our allocation, then we - * add these pages to the cpu_buffers. Otherwise we just free - * them all and return -ENOMEM; - */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) - goto out_fail; - - new_pages = nr_pages - buffer->pages; - - for_each_buffer_cpu(buffer, cpu) { - for (i = 0; i < new_pages; i++) { - struct page *page; - /* - * __GFP_NORETRY flag makes sure that the allocation - * fails gracefully without invoking oom-killer and - * the system is not destabilized. 
- */ - bpage = kzalloc_node(ALIGN(sizeof(*bpage), - cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu)); - if (!bpage) - goto free_pages; - list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); - } - } - - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - rb_insert_pages(cpu_buffer, &pages, new_pages); - } - - if (RB_WARN_ON(buffer, !list_empty(&pages))) - goto out_fail; - - out: - buffer->pages = nr_pages; - put_online_cpus(); - mutex_unlock(&buffer->mutex); - - atomic_dec(&buffer->record_disabled); - - return size; - - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - put_online_cpus(); - mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -ENOMEM; - - /* - * Something went totally wrong, and we are too paranoid - * to even clean up the mess. 
- */ - out_fail: - put_online_cpus(); - mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -1; -} -EXPORT_SYMBOL_GPL(ring_buffer_resize); - -void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) -{ - mutex_lock(&buffer->mutex); - if (val) - buffer->flags |= RB_FL_OVERWRITE; - else - buffer->flags &= ~RB_FL_OVERWRITE; - mutex_unlock(&buffer->mutex); -} -EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); - -static inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - -static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) -{ - return bpage->page->data + index; -} - -static inline struct ring_buffer_event * -rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->reader_page, - cpu_buffer->reader_page->read); -} - -static inline struct ring_buffer_event * -rb_iter_head_event(struct ring_buffer_iter *iter) -{ - return __rb_page_index(iter->head_page, iter->head); -} - -static inline unsigned long rb_page_write(struct buffer_page *bpage) -{ - return local_read(&bpage->write) & RB_WRITE_MASK; -} - -static inline unsigned rb_page_commit(struct buffer_page *bpage) -{ - return local_read(&bpage->page->commit); -} - -static inline unsigned long rb_page_entries(struct buffer_page *bpage) -{ - return local_read(&bpage->entries) & RB_WRITE_MASK; -} - -/* Size is determined by what has been committed */ -static inline unsigned rb_page_size(struct buffer_page *bpage) -{ - return rb_page_commit(bpage); -} - -static inline unsigned -rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->commit_page); -} - -static inline unsigned -rb_event_index(struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - - return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; -} - -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct 
ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->buffer->pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. 
- */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - -static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; - cpu_buffer->reader_page->read = 0; -} - -static void rb_inc_iter(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - - /* - * The iterator could be on the reader page (it starts there). - * But the head could have moved, since the reader was - * found. Check for this case and assign the iterator - * to the head page instead of next. - */ - if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = rb_set_head_page(cpu_buffer); - else - rb_inc_page(cpu_buffer, &iter->head_page); - - iter->read_stamp = iter->head_page->page->time_stamp; - iter->head = 0; -} - -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * ring_buffer_update_event - update event type and data - * @event: the even to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. 
- */ -static void -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) -{ - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - -/* - * rb_handle_head_page - writer hit the head page - * - * Returns: +1 to retry page - * 0 to continue - * -1 on error - */ -static int -rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - struct buffer_page *next_page) -{ - struct buffer_page *new_head; - int entries; - int type; - int ret; - - entries = rb_page_entries(next_page); - - /* - * The hard part is here. We need to move the head - * forward, and protect against both readers on - * other CPUs and writers coming in via interrupts. - */ - type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, - RB_PAGE_HEAD); - - /* - * type can be one of four: - * NORMAL - an interrupt already moved it for us - * HEAD - we are the first to get here. - * UPDATE - we are the interrupt interrupting - * a current move. - * MOVED - a reader on another CPU moved the next - * pointer to its reader page. Give up - * and try again. - */ - - switch (type) { - case RB_PAGE_HEAD: - /* - * We changed the head to UPDATE, thus - * it is our responsibility to update - * the counters. 
- */ - local_add(entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); - - /* - * The entries will be zeroed out when we move the - * tail page. - */ - - /* still more to do */ - break; - - case RB_PAGE_UPDATE: - /* - * This is an interrupt that interrupt the - * previous update. Still more to do. - */ - break; - case RB_PAGE_NORMAL: - /* - * An interrupt came in before the update - * and processed this for us. - * Nothing left to do. - */ - return 1; - case RB_PAGE_MOVED: - /* - * The reader is on another CPU and just did - * a swap with our next_page. - * Try again. - */ - return 1; - default: - RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ - return -1; - } - - /* - * Now that we are here, the old head pointer is - * set to UPDATE. This will keep the reader from - * swapping the head page with the reader page. - * The reader (on another CPU) will spin till - * we are finished. - * - * We just need to protect against interrupts - * doing the job. We will set the next pointer - * to HEAD. After that, we set the old pointer - * to NORMAL, but only if it was HEAD before. - * otherwise we are an interrupt, and only - * want the outer most commit to reset it. - */ - new_head = next_page; - rb_inc_page(cpu_buffer, &new_head); - - ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, - RB_PAGE_NORMAL); - - /* - * Valid returns are: - * HEAD - an interrupt came in and already set it. - * NORMAL - One of two things: - * 1) We really set it. - * 2) A bunch of interrupts came in and moved - * the page forward again. - */ - switch (ret) { - case RB_PAGE_HEAD: - case RB_PAGE_NORMAL: - /* OK */ - break; - default: - RB_WARN_ON(cpu_buffer, 1); - return -1; - } - - /* - * It is possible that an interrupt came in, - * set the head up, then more interrupts came in - * and moved it again. When we get back here, - * the page would have been set to NORMAL but we - * just set it back to HEAD. - * - * How do you detect this? 
Well, if that happened - * the tail page would have moved. - */ - if (ret == RB_PAGE_NORMAL) { - /* - * If the tail had moved passed next, then we need - * to reset the pointer. - */ - if (cpu_buffer->tail_page != tail_page && - cpu_buffer->tail_page != next_page) - rb_head_page_set_normal(cpu_buffer, new_head, - next_page, - RB_PAGE_HEAD); - } - - /* - * If this was the outer most commit (the one that - * changed the original pointer from HEAD to UPDATE), - * then it is up to us to reset it to NORMAL. - */ - if (type == RB_PAGE_HEAD) { - ret = rb_head_page_set_normal(cpu_buffer, next_page, - tail_page, - RB_PAGE_UPDATE); - if (RB_WARN_ON(cpu_buffer, - ret != RB_PAGE_UPDATE)) - return -1; - } - - return 0; -} - -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length = 1; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - return length; -} - -static inline void -rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) -{ - struct ring_buffer_event *event; - - /* - * Only the event that crossed the page boundary - * must fill the old tail_page with padding. - */ - if (tail >= BUF_PAGE_SIZE) { - /* - * If the page was filled, then we still need - * to update the real_end. Reset it to zero - * and the reader will ignore it. - */ - if (tail == BUF_PAGE_SIZE) - tail_page->real_end = 0; - - local_sub(length, &tail_page->write); - return; - } - - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - - /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); - - /* - * Save the original length to the meta data. 
- * This will be used by the reader to add lost event - * counter. - */ - tail_page->real_end = tail; - - /* - * If this event is bigger than the minimum size, then - * we need to be careful that we don't subtract the - * write counter enough to allow another writer to slip - * in on this page. - * We put in a discarded commit instead, to make sure - * that this space is not used again. - * - * If we are less than the minimum size, we don't need to - * worry about it. - */ - if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { - /* No room for any events */ - - /* Mark the rest of the page with padding */ - rb_event_set_padding(event); - - /* Set the write back to the previous setting */ - local_sub(length, &tail_page->write); - return; - } - - /* Put in a discarded event */ - event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - event->time_delta = 1; - - /* Set write to end of buffer */ - length = (tail + length) - BUF_PAGE_SIZE; - local_sub(length, &tail_page->write); -} - -/* - * This is the slow path, force gcc not to inline it. - */ -static noinline struct ring_buffer_event * -rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) -{ - struct buffer_page *commit_page = cpu_buffer->commit_page; - struct ring_buffer *buffer = cpu_buffer->buffer; - struct buffer_page *next_page; - int ret; - - next_page = tail_page; - - rb_inc_page(cpu_buffer, &next_page); - - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - local_inc(&cpu_buffer->commit_overrun); - goto out_reset; - } - - /* - * This is where the fun begins! - * - * We are fighting against races between a reader that - * could be on another CPU trying to swap its reader - * page with the buffer head. 
- * - * We are also fighting against interrupts coming in and - * moving the head or tail on us as well. - * - * If the next page is the head page then we have filled - * the buffer, unless the commit page is still on the - * reader page. - */ - if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - - /* - * If the commit is not on the reader page, then - * move the header page. - */ - if (!rb_is_reader_page(cpu_buffer->commit_page)) { - /* - * If we are not in overwrite mode, - * this is easy, just stop here. - */ - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - ret = rb_handle_head_page(cpu_buffer, - tail_page, - next_page); - if (ret < 0) - goto out_reset; - if (ret) - goto out_again; - } else { - /* - * We need to be careful here too. The - * commit page could still be on the reader - * page. We could have a small buffer, and - * have filled up the buffer with events - * from interrupts and such, and wrapped. - * - * Note, if the tail page is also the on the - * reader_page, we let it move out. 
- */ - if (unlikely((cpu_buffer->commit_page != - cpu_buffer->tail_page) && - (cpu_buffer->commit_page == - cpu_buffer->reader_page))) { - local_inc(&cpu_buffer->commit_overrun); - goto out_reset; - } - } - } - - ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); - if (ret) { - /* - * Nested commits always have zero deltas, so - * just reread the time stamp - */ - ts = rb_time_stamp(buffer); - next_page->page->time_stamp = ts; - } - - out_again: - - rb_reset_tail(cpu_buffer, tail_page, tail, length); - - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); - - out_reset: - /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); - - return NULL; -} - -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) -{ - struct buffer_page *tail_page; - struct ring_buffer_event *event; - unsigned long tail, write; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; - - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - - /* set write to only the index of the write */ - write &= RB_WRITE_MASK; - tail = write - length; - - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); - - /* We reserved something on the buffer */ - - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - - local_inc(&tail_page->entries); - - /* - * If this is the first commit on the page, then update - * its timestamp. 
- */ - if (!tail) - tail_page->page->time_stamp = ts; - - /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); - - return event; -} - -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; - - new_index = rb_event_index(event); - old_index = new_index + rb_event_ts_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - unsigned long write_mask = - local_read(&bpage->write) & ~RB_WRITE_MASK; - unsigned long event_length = rb_event_length(event); - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. - */ - old_index += write_mask; - new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { - /* update counters */ - local_sub(event_length, &cpu_buffer->entries_bytes); - return 1; - } - } - - /* could not discard */ - return 0; -} - -static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - local_inc(&cpu_buffer->committing); - local_inc(&cpu_buffer->commits); -} - -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long commits; - - if (RB_WARN_ON(cpu_buffer, - !local_read(&cpu_buffer->committing))) - return; - - again: - commits = local_read(&cpu_buffer->commits); - /* synchronize with interrupts */ - barrier(); - if (local_read(&cpu_buffer->committing) == 1) - rb_set_commit_to_write(cpu_buffer); - - local_dec(&cpu_buffer->committing); - - /* synchronize with interrupts */ - barrier(); - - /* - * Need to account for interrupts coming in between the - * updating of the commit page and the 
clearing of the - * committing counter. - */ - if (unlikely(local_read(&cpu_buffer->commits) != commits) && - !local_read(&cpu_buffer->committing)) { - local_inc(&cpu_buffer->committing); - goto again; - } -} - -static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, - struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) -{ - struct ring_buffer_event *event; - u64 ts, delta; - int nr_loops = 0; - int add_timestamp; - u64 diff; - - rb_start_commit(cpu_buffer); - -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - /* - * Due to the ability to swap a cpu buffer from a buffer - * it is possible it was swapped before we committed. - * (committing stops a swap). We check for it here and - * if it happened, we have to fail the write. - */ - barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { - local_dec(&cpu_buffer->committing); - local_dec(&cpu_buffer->commits); - return NULL; - } -#endif - - length = rb_calculate_event_length(length); - again: - add_timestamp = 0; - delta = 0; - - /* - * We allow for interrupts to reenter here and do a trace. - * If one does, it will cause this original code to loop - * back here. Even with heavy interrupts happening, this - * should only happen a few times in a row. If this happens - * 1000 times in a row, there must be either an interrupt - * storm or we have something buggy. - * Bail! - */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - goto out_fail; - - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable; -#endif - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; - } - } - - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); - if (unlikely(PTR_ERR(event) == -EAGAIN)) - goto again; - - if (!event) - goto out_fail; - - return event; - - out_fail: - rb_end_commit(cpu_buffer); - return NULL; -} - -#ifdef CONFIG_TRACING - -#define TRACE_RECURSIVE_DEPTH 16 - -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) -{ - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); - - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); - - WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); - - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; - - trace_recursive_fail(); - - return -1; -} - -static inline void trace_recursive_unlock(void) -{ - WARN_ON_ONCE(!trace_recursion_buffer()); - - trace_recursion_dec(); -} - -#else - -#define trace_recursive_lock() (0) -#define trace_recursive_unlock() do { } while (0) - -#endif - -/** - * ring_buffer_lock_reserve - reserve a part of the buffer - * @buffer: the ring buffer to reserve from - * @length: the length of the data to reserve (excluding event header) - * - * Returns a reseverd event on the ring buffer to copy directly to. - * The user of this interface will need to get the body to write into - * and can use the ring_buffer_event_data() interface. 
- * - * The length is the length of the data needed, not the event length - * which also includes the event header. - * - * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. - * If NULL is returned, then nothing has been allocated or locked. - */ -struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) -{ - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - int cpu; - - if (ring_buffer_flags != RB_BUFFERS_ON) - return NULL; - - /* If we are tracing schedule, we don't want to recurse */ - preempt_disable_notrace(); - - if (atomic_read(&buffer->record_disabled)) - goto out_nocheck; - - if (trace_recursive_lock()) - goto out_nocheck; - - cpu = raw_smp_processor_id(); - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; - - cpu_buffer = buffer->buffers[cpu]; - - if (atomic_read(&cpu_buffer->record_disabled)) - goto out; - - if (length > BUF_MAX_DATA_SIZE) - goto out; - - event = rb_reserve_next_event(buffer, cpu_buffer, length); - if (!event) - goto out; - - return event; - - out: - trace_recursive_unlock(); - - out_nocheck: - preempt_enable_notrace(); - return NULL; -} -EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); - -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - trace_recursive_unlock(); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - -/* - * Decrement the entries to the page that an event is on. - * The event does not even need to exist, only the pointer - * to the page it is on. This may only be called before the commit - * takes place. 
- */ -static inline void -rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - struct buffer_page *bpage = cpu_buffer->commit_page; - struct buffer_page *start; - - addr &= PAGE_MASK; - - /* Do the likely case first */ - if (likely(bpage->page == (void *)addr)) { - local_dec(&bpage->entries); - return; - } - - /* - * Because the commit page may be on the reader page we - * start with the next page and check the end loop there. - */ - rb_inc_page(cpu_buffer, &bpage); - start = bpage; - do { - if (bpage->page == (void *)addr) { - local_dec(&bpage->entries); - return; - } - rb_inc_page(cpu_buffer, &bpage); - } while (bpage != start); - - /* commit not part of this buffer?? */ - RB_WARN_ON(cpu_buffer, 1); -} - -/** - * ring_buffer_commit_discard - discard an event that has not been committed - * @buffer: the ring buffer - * @event: non committed event to discard - * - * Sometimes an event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * This function only works if it is called before the the item has been - * committed. It will try to free the event from the ring buffer - * if another event has not been added behind it. - * - * If another event has been added behind it, it will set the event - * up as discarded, and perform the commit. - * - * If this function is called, do not call ring_buffer_unlock_commit on - * the event. - */ -void ring_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu; - - /* The event is discarded regardless */ - rb_event_discard(event); - - cpu = smp_processor_id(); - cpu_buffer = buffer->buffers[cpu]; - - /* - * This must only be called if the event has not been - * committed yet. Thus we can assume that preemption - * is still disabled. 
- */ - RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - - rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - /* - * The commit is still visible by the reader, so we - * must still update the timestamp. - */ - rb_update_write_stamp(cpu_buffer, event); - out: - rb_end_commit(cpu_buffer); - - trace_recursive_unlock(); - - preempt_enable_notrace(); - -} -EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); - -/** - * ring_buffer_write - write data to the buffer without reserving - * @buffer: The ring buffer to write to. - * @length: The length of the data being written (excluding the event header) - * @data: The data to write to the buffer. - * - * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as - * one function. If you already have the data to write to the buffer, it - * may be easier to simply call this function. - * - * Note, like ring_buffer_lock_reserve, the length is the length of the data - * and not the length of the event which would hold the header. 
- */ -int ring_buffer_write(struct ring_buffer *buffer, - unsigned long length, - void *data) -{ - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - void *body; - int ret = -EBUSY; - int cpu; - - if (ring_buffer_flags != RB_BUFFERS_ON) - return -EBUSY; - - preempt_disable_notrace(); - - if (atomic_read(&buffer->record_disabled)) - goto out; - - cpu = raw_smp_processor_id(); - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; - - cpu_buffer = buffer->buffers[cpu]; - - if (atomic_read(&cpu_buffer->record_disabled)) - goto out; - - if (length > BUF_MAX_DATA_SIZE) - goto out; - - event = rb_reserve_next_event(buffer, cpu_buffer, length); - if (!event) - goto out; - - body = rb_event_data(event); - - memcpy(body, data, length); - - rb_commit(cpu_buffer, event); - - ret = 0; - out: - preempt_enable_notrace(); - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_write); - -static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = rb_set_head_page(cpu_buffer); - struct buffer_page *commit = cpu_buffer->commit_page; - - /* In case of error, head will be NULL */ - if (unlikely(!head)) - return 1; - - return reader->read == rb_page_commit(reader) && - (commit == reader || - (commit == head && - head->read == rb_page_commit(commit))); -} - -/** - * ring_buffer_record_disable - stop all writes into the buffer - * @buffer: The ring buffer to stop writes to. - * - * This prevents all writes to the buffer. Any attempt to write - * to the buffer after this will fail and return NULL. - * - * The caller should call synchronize_sched() after this. 
- */ -void ring_buffer_record_disable(struct ring_buffer *buffer) -{ - atomic_inc(&buffer->record_disabled); -} -EXPORT_SYMBOL_GPL(ring_buffer_record_disable); - -/** - * ring_buffer_record_enable - enable writes to the buffer - * @buffer: The ring buffer to enable writes - * - * Note, multiple disables will need the same number of enables - * to truly enable the writing (much like preempt_disable). - */ -void ring_buffer_record_enable(struct ring_buffer *buffer) -{ - atomic_dec(&buffer->record_disabled); -} -EXPORT_SYMBOL_GPL(ring_buffer_record_enable); - -/** - * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer - * @buffer: The ring buffer to stop writes to. - * @cpu: The CPU buffer to stop - * - * This prevents all writes to the buffer. Any attempt to write - * to the buffer after this will fail and return NULL. - * - * The caller should call synchronize_sched() after this. - */ -void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; - - cpu_buffer = buffer->buffers[cpu]; - atomic_inc(&cpu_buffer->record_disabled); -} -EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); - -/** - * ring_buffer_record_enable_cpu - enable writes to the buffer - * @buffer: The ring buffer to enable writes - * @cpu: The CPU to enable. - * - * Note, multiple disables will need the same number of enables - * to truly enable the writing (much like preempt_disable). 
- */ -void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; - - cpu_buffer = buffer->buffers[cpu]; - atomic_dec(&cpu_buffer->record_disabled); -} -EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); - -/* - * The total entries in the ring buffer is the running counter - * of entries entered into the ring buffer, minus the sum of - * the entries read from the ring buffer and the number of - * entries that were overwritten. - */ -static inline unsigned long -rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) -{ - return local_read(&cpu_buffer->entries) - - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); -} - -/** - * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer - * @buffer: The ring buffer - * @cpu: The per CPU buffer to read from. - */ -unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) -{ - unsigned long flags; - struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *bpage; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * if the tail is on reader_page, oldest time stamp is on the reader - * page - */ - if (cpu_buffer->tail_page == cpu_buffer->reader_page) - bpage = cpu_buffer->reader_page; - else - bpage = rb_set_head_page(cpu_buffer); - ret = bpage->page->time_stamp; - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); - -/** - * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer - * @buffer: The ring buffer - * @cpu: The per CPU buffer to read from. 
- */ -unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); - -/** - * ring_buffer_entries_cpu - get the number of entries in a cpu buffer - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the entries from. - */ -unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - - return rb_num_of_entries(cpu_buffer); -} -EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); - -/** - * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = local_read(&cpu_buffer->overrun); - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); - -/** - * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long -ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = local_read(&cpu_buffer->commit_overrun); - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); - -/** - * ring_buffer_entries - get the number of entries 
in a buffer - * @buffer: The ring buffer - * - * Returns the total number of entries in the ring buffer - * (all CPU entries) - */ -unsigned long ring_buffer_entries(struct ring_buffer *buffer) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long entries = 0; - int cpu; - - /* if you care about this being correct, lock the buffer */ - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - entries += rb_num_of_entries(cpu_buffer); - } - - return entries; -} -EXPORT_SYMBOL_GPL(ring_buffer_entries); - -/** - * ring_buffer_overruns - get the number of overruns in buffer - * @buffer: The ring buffer - * - * Returns the total number of overruns in the ring buffer - * (all CPU entries) - */ -unsigned long ring_buffer_overruns(struct ring_buffer *buffer) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long overruns = 0; - int cpu; - - /* if you care about this being correct, lock the buffer */ - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - overruns += local_read(&cpu_buffer->overrun); - } - - return overruns; -} -EXPORT_SYMBOL_GPL(ring_buffer_overruns); - -static void rb_iter_reset(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - - /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } - if (iter->head) - iter->read_stamp = cpu_buffer->read_stamp; - else - iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; -} - -/** - * ring_buffer_iter_reset - reset an iterator - * @iter: The iterator to reset - * - * Resets the iterator, so that it will start from the beginning - * again. 
- */ -void ring_buffer_iter_reset(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - rb_iter_reset(iter); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -} -EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); - -/** - * ring_buffer_iter_empty - check if an iterator has no more to read - * @iter: The iterator to check - */ -int ring_buffer_iter_empty(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - - cpu_buffer = iter->cpu_buffer; - - return iter->head_page == cpu_buffer->commit_page && - iter->head == rb_commit_index(cpu_buffer); -} -EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); - -static void -rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - return; - - case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->read_stamp += delta; - return; - - case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ - return; - - case RINGBUF_TYPE_DATA: - cpu_buffer->read_stamp += event->time_delta; - return; - - default: - BUG(); - } - return; -} - -static void -rb_update_iter_read_stamp(struct ring_buffer_iter *iter, - struct ring_buffer_event *event) -{ - u64 delta; - - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - return; - - case RINGBUF_TYPE_TIME_EXTEND: - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - iter->read_stamp += delta; - return; - - case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ - return; - - case RINGBUF_TYPE_DATA: - iter->read_stamp += event->time_delta; - return; - - default: - BUG(); - } - return; -} - -static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct buffer_page 
*reader = NULL; - unsigned long overwrite; - unsigned long flags; - int nr_loops = 0; - int ret; - - local_irq_save(flags); - arch_spin_lock(&cpu_buffer->lock); - - again: - /* - * This should normally only loop twice. But because the - * start of the reader inserts an empty page, it causes - * a case where we will loop three times. There should be no - * reason to loop four times (that I know of). - */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { - reader = NULL; - goto out; - } - - reader = cpu_buffer->reader_page; - - /* If there's more to read, return this page */ - if (cpu_buffer->reader_page->read < rb_page_size(reader)) - goto out; - - /* Never should we have an index greater than the size */ - if (RB_WARN_ON(cpu_buffer, - cpu_buffer->reader_page->read > rb_page_size(reader))) - goto out; - - /* check if we caught up to the tail */ - reader = NULL; - if (cpu_buffer->commit_page == cpu_buffer->reader_page) - goto out; - - /* - * Reset the reader page to size zero. - */ - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); - cpu_buffer->reader_page->real_end = 0; - - spin: - /* - * Splice the empty reader page into the list around the head. - */ - reader = rb_set_head_page(cpu_buffer); - cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); - cpu_buffer->reader_page->list.prev = reader->list.prev; - - /* - * cpu_buffer->pages just needs to point to the buffer, it - * has no specific buffer page to point to. Lets move it out - * of our way so we don't accidentally swap it. - */ - cpu_buffer->pages = reader->list.prev; - - /* The reader page will be pointing to the new head */ - rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); - - /* - * We want to make sure we read the overruns after we set up our - * pointers to the next object. The writer side does a - * cmpxchg to cross pages which acts as the mb on the writer - * side. 
Note, the reader will constantly fail the swap - * while the writer is updating the pointers, so this - * guarantees that the overwrite recorded here is the one we - * want to compare with the last_overrun. - */ - smp_mb(); - overwrite = local_read(&(cpu_buffer->overrun)); - - /* - * Here's the tricky part. - * - * We need to move the pointer past the header page. - * But we can only do that if a writer is not currently - * moving it. The page before the header page has the - * flag bit '1' set if it is pointing to the page we want. - * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. - */ - - ret = rb_head_page_replace(reader, cpu_buffer->reader_page); - - /* - * If we did not convert it, then we must try again. - */ - if (!ret) - goto spin; - - /* - * Yeah! We succeeded in replacing the page. - * - * Now make the new head point back to the reader page. - */ - rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); - - /* Finally update the reader page to the new head */ - cpu_buffer->reader_page = reader; - rb_reset_reader_page(cpu_buffer); - - if (overwrite != cpu_buffer->last_overrun) { - cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; - cpu_buffer->last_overrun = overwrite; - } - - goto again; - - out: - arch_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); - - return reader; -} - -static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - struct buffer_page *reader; - unsigned length; - - reader = rb_get_reader_page(cpu_buffer); - - /* This function should not be called when buffer is empty */ - if (RB_WARN_ON(cpu_buffer, !reader)) - return; - - event = rb_reader_event(cpu_buffer); - - if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - cpu_buffer->read++; - - rb_update_read_stamp(cpu_buffer, event); - - length = rb_event_length(event); - 
cpu_buffer->reader_page->read += length; -} - -static void rb_advance_iter(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - unsigned length; - - cpu_buffer = iter->cpu_buffer; - - /* - * Check if we are at the end of the buffer. - */ - if (iter->head >= rb_page_size(iter->head_page)) { - /* discarded commits can make the page empty */ - if (iter->head_page == cpu_buffer->commit_page) - return; - rb_inc_iter(iter); - return; - } - - event = rb_iter_head_event(iter); - - length = rb_event_length(event); - - /* - * This should not be called to advance the header if we are - * at the tail of the buffer. - */ - if (RB_WARN_ON(cpu_buffer, - (iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer)))) - return; - - rb_update_iter_read_stamp(iter, event); - - iter->head += length; - - /* check for end of page padding */ - if ((iter->head >= rb_page_size(iter->head_page)) && - (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); -} - -static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) -{ - return cpu_buffer->lost_events; -} - -static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, - unsigned long *lost_events) -{ - struct ring_buffer_event *event; - struct buffer_page *reader; - int nr_loops = 0; - - again: - /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). 
- */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) - return NULL; - - reader = rb_get_reader_page(cpu_buffer); - if (!reader) - return NULL; - - event = rb_reader_event(cpu_buffer); - - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - if (rb_null_event(event)) - RB_WARN_ON(cpu_buffer, 1); - /* - * Because the writer could be discarding every - * event it creates (which would probably be bad) - * if we were to go back to "again" then we may never - * catch up, and will trigger the warn on, or lock - * the box. Return the padding, and we will release - * the current locks, and try again. - */ - return event; - - case RINGBUF_TYPE_TIME_EXTEND: - /* Internal data, OK to advance */ - rb_advance_reader(cpu_buffer); - goto again; - - case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ - rb_advance_reader(cpu_buffer); - goto again; - - case RINGBUF_TYPE_DATA: - if (ts) { - *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->buffer, - cpu_buffer->cpu, ts); - } - if (lost_events) - *lost_events = rb_lost_events(cpu_buffer); - return event; - - default: - BUG(); - } - - return NULL; -} -EXPORT_SYMBOL_GPL(ring_buffer_peek); - -static struct ring_buffer_event * -rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) -{ - struct ring_buffer *buffer; - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event; - int nr_loops = 0; - - cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; - - /* - * Check if someone performed a consuming read to - * the buffer. A consuming read invalidates the iterator - * and we need to reset the iterator in this case. - */ - if (unlikely(iter->cache_read != cpu_buffer->read || - iter->cache_reader_page != cpu_buffer->reader_page)) - rb_iter_reset(iter); - - again: - if (ring_buffer_iter_empty(iter)) - return NULL; - - /* - * We repeat when a time extend is encountered. 
- * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). - */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) - return NULL; - - if (rb_per_cpu_empty(cpu_buffer)) - return NULL; - - if (iter->head >= local_read(&iter->head_page->page->commit)) { - rb_inc_iter(iter); - goto again; - } - - event = rb_iter_head_event(iter); - - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - if (rb_null_event(event)) { - rb_inc_iter(iter); - goto again; - } - rb_advance_iter(iter); - return event; - - case RINGBUF_TYPE_TIME_EXTEND: - /* Internal data, OK to advance */ - rb_advance_iter(iter); - goto again; - - case RINGBUF_TYPE_TIME_STAMP: - /* FIXME: not implemented */ - rb_advance_iter(iter); - goto again; - - case RINGBUF_TYPE_DATA: - if (ts) { - *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, - cpu_buffer->cpu, ts); - } - return event; - - default: - BUG(); - } - - return NULL; -} -EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); - -static inline int rb_ok_to_lock(void) -{ - /* - * If an NMI die dumps out the content of the ring buffer - * do not grab locks. We also permanently disable the ring - * buffer too. A one time deal is all you get from reading - * the ring buffer from an NMI. - */ - if (likely(!in_nmi())) - return 1; - - tracing_off_permanent(); - return 0; -} - -/** - * ring_buffer_peek - peek at the next event to be read - * @buffer: The ring buffer to read - * @cpu: The cpu to peak at - * @ts: The timestamp counter of this event. - * @lost_events: a variable to store if events were lost (may be NULL) - * - * This will return the event that will be read next, but does - * not consume the data. 
- */ -struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, - unsigned long *lost_events) -{ - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; - unsigned long flags; - int dolock; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; - - dolock = rb_ok_to_lock(); - again: - local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts, lost_events); - if (event && event->type_len == RINGBUF_TYPE_PADDING) - rb_advance_reader(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); - - if (event && event->type_len == RINGBUF_TYPE_PADDING) - goto again; - - return event; -} - -/** - * ring_buffer_iter_peek - peek at the next event to be read - * @iter: The ring buffer iterator - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not increment the iterator. - */ -struct ring_buffer_event * -ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) -{ - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - struct ring_buffer_event *event; - unsigned long flags; - - again: - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - event = rb_iter_peek(iter, ts); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - if (event && event->type_len == RINGBUF_TYPE_PADDING) - goto again; - - return event; -} - -/** - * ring_buffer_consume - return an event and consume it - * @buffer: The ring buffer to get the next event from - * @cpu: the cpu to read the buffer from - * @ts: a variable to store the timestamp (may be NULL) - * @lost_events: a variable to store if events were lost (may be NULL) - * - * Returns the next event in the ring buffer, and that event is consumed. 
- * Meaning, that sequential reads will keep returning a different event, - * and eventually empty the ring buffer if the producer is slower. - */ -struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, - unsigned long *lost_events) -{ - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_event *event = NULL; - unsigned long flags; - int dolock; - - dolock = rb_ok_to_lock(); - - again: - /* might be called in atomic */ - preempt_disable(); - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; - - cpu_buffer = buffer->buffers[cpu]; - local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); - - event = rb_buffer_peek(cpu_buffer, ts, lost_events); - if (event) { - cpu_buffer->lost_events = 0; - rb_advance_reader(cpu_buffer); - } - - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); - - out: - preempt_enable(); - - if (event && event->type_len == RINGBUF_TYPE_PADDING) - goto again; - - return event; -} -EXPORT_SYMBOL_GPL(ring_buffer_consume); - -/** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer - * @buffer: The ring buffer to read from - * @cpu: The cpu buffer to iterate over - * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer recording - * is disabled, and the iterator pointer is returned to the caller. - * - * Disabling buffer recordng prevents the reading from being - * corrupted. This is not a consuming read, so a producer is not - * expected. - * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. - * - * This overall must be paired with ring_buffer_finish. 
- */ -struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter *iter; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return NULL; - - cpu_buffer = buffer->buffers[cpu]; - - iter->cpu_buffer = cpu_buffer; - - atomic_inc(&cpu_buffer->record_disabled); - - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_sched(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_finish. - */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - arch_spin_lock(&cpu_buffer->lock); - rb_iter_reset(iter); - arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_start); - -/** - * ring_buffer_finish - finish reading the iterator of the buffer - * @iter: The iterator retrieved by ring_buffer_start - * - * This re-enables the recording to the buffer, and frees the - * iterator. 
- */ -void -ring_buffer_read_finish(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - - atomic_dec(&cpu_buffer->record_disabled); - kfree(iter); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_finish); - -/** - * ring_buffer_read - read the next item in the ring buffer by the iterator - * @iter: The ring buffer iterator - * @ts: The time stamp of the event read. - * - * This reads the next event in the ring buffer and increments the iterator. - */ -struct ring_buffer_event * -ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) -{ - struct ring_buffer_event *event; - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - again: - event = rb_iter_peek(iter, ts); - if (!event) - goto out; - - if (event->type_len == RINGBUF_TYPE_PADDING) - goto again; - - rb_advance_iter(iter); - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - return event; -} -EXPORT_SYMBOL_GPL(ring_buffer_read); - -/** - * ring_buffer_size - return the size of the ring buffer (in bytes) - * @buffer: The ring buffer. 
- */ -unsigned long ring_buffer_size(struct ring_buffer *buffer) -{ - return BUF_PAGE_SIZE * buffer->pages; -} -EXPORT_SYMBOL_GPL(ring_buffer_size); - -static void -rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) -{ - rb_head_page_deactivate(cpu_buffer); - - cpu_buffer->head_page - = list_entry(cpu_buffer->pages, struct buffer_page, list); - local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->entries, 0); - local_set(&cpu_buffer->head_page->page->commit, 0); - - cpu_buffer->head_page->read = 0; - - cpu_buffer->tail_page = cpu_buffer->head_page; - cpu_buffer->commit_page = cpu_buffer->head_page; - - INIT_LIST_HEAD(&cpu_buffer->reader_page->list); - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); - cpu_buffer->reader_page->read = 0; - - local_set(&cpu_buffer->commit_overrun, 0); - local_set(&cpu_buffer->entries_bytes, 0); - local_set(&cpu_buffer->overrun, 0); - local_set(&cpu_buffer->entries, 0); - local_set(&cpu_buffer->committing, 0); - local_set(&cpu_buffer->commits, 0); - cpu_buffer->read = 0; - cpu_buffer->read_bytes = 0; - - cpu_buffer->write_stamp = 0; - cpu_buffer->read_stamp = 0; - - cpu_buffer->lost_events = 0; - cpu_buffer->last_overrun = 0; - - rb_head_page_activate(cpu_buffer); -} - -/** - * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer - * @buffer: The ring buffer to reset a per cpu buffer of - * @cpu: The CPU buffer to be reset - */ -void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - unsigned long flags; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; - - atomic_inc(&cpu_buffer->record_disabled); - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - - if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; - - arch_spin_lock(&cpu_buffer->lock); - - rb_reset_cpu(cpu_buffer); - - 
arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - atomic_dec(&cpu_buffer->record_disabled); -} -EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); - -/** - * ring_buffer_reset - reset a ring buffer - * @buffer: The ring buffer to reset all cpu buffers - */ -void ring_buffer_reset(struct ring_buffer *buffer) -{ - int cpu; - - for_each_buffer_cpu(buffer, cpu) - ring_buffer_reset_cpu(buffer, cpu); -} -EXPORT_SYMBOL_GPL(ring_buffer_reset); - -/** - * rind_buffer_empty - is the ring buffer empty? - * @buffer: The ring buffer to test - */ -int ring_buffer_empty(struct ring_buffer *buffer) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - int dolock; - int cpu; - int ret; - - dolock = rb_ok_to_lock(); - - /* yes this is racy, but if you don't like the race, lock the buffer */ - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); - ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); - - if (!ret) - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(ring_buffer_empty); - -/** - * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? 
- * @buffer: The ring buffer - * @cpu: The CPU buffer to test - */ -int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - int dolock; - int ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; - - dolock = rb_ok_to_lock(); - - cpu_buffer = buffer->buffers[cpu]; - local_irq_save(flags); - if (dolock) - raw_spin_lock(&cpu_buffer->reader_lock); - ret = rb_per_cpu_empty(cpu_buffer); - if (dolock) - raw_spin_unlock(&cpu_buffer->reader_lock); - local_irq_restore(flags); - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); - -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP -/** - * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers - * @buffer_a: One buffer to swap with - * @buffer_b: The other buffer to swap with - * - * This function is useful for tracers that want to take a "snapshot" - * of a CPU buffer and has another back up buffer lying around. - * it is expected that the tracer handles the cpu buffer not being - * used at the moment. 
- */ -int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, - struct ring_buffer *buffer_b, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer_a; - struct ring_buffer_per_cpu *cpu_buffer_b; - int ret = -EINVAL; - - if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || - !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; - - /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->pages != buffer_b->pages) - goto out; - - ret = -EAGAIN; - - if (ring_buffer_flags != RB_BUFFERS_ON) - goto out; - - if (atomic_read(&buffer_a->record_disabled)) - goto out; - - if (atomic_read(&buffer_b->record_disabled)) - goto out; - - cpu_buffer_a = buffer_a->buffers[cpu]; - cpu_buffer_b = buffer_b->buffers[cpu]; - - if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; - - if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; - - /* - * We can't do a synchronize_sched here because this - * function can be called in atomic context. - * Normally this will be called from the same CPU as cpu. - * If not it's up to the caller to protect this. - */ - atomic_inc(&cpu_buffer_a->record_disabled); - atomic_inc(&cpu_buffer_b->record_disabled); - - ret = -EBUSY; - if (local_read(&cpu_buffer_a->committing)) - goto out_dec; - if (local_read(&cpu_buffer_b->committing)) - goto out_dec; - - buffer_a->buffers[cpu] = cpu_buffer_b; - buffer_b->buffers[cpu] = cpu_buffer_a; - - cpu_buffer_b->buffer = buffer_a; - cpu_buffer_a->buffer = buffer_b; - - ret = 0; - -out_dec: - atomic_dec(&cpu_buffer_a->record_disabled); - atomic_dec(&cpu_buffer_b->record_disabled); -out: - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ - -/** - * ring_buffer_alloc_read_page - allocate a page to read from buffer - * @buffer: the buffer to allocate for. - * - * This function is used in conjunction with ring_buffer_read_page. - * When reading a full page from the ring buffer, these functions - * can be used to speed up the process. 
The calling function should - * allocate a few pages first with this function. Then when it - * needs to get pages from the ring buffer, it passes the result - * of this function into ring_buffer_read_page, which will swap - * the page that was allocated, with the read page of the buffer. - * - * Returns: - * The page allocated, or NULL on error. - */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) -{ - struct buffer_data_page *bpage; - struct page *page; - - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - return NULL; - - bpage = page_address(page); - - rb_init_page(bpage); - - return bpage; -} -EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); - -/** - * ring_buffer_free_read_page - free an allocated read page - * @buffer: the buffer the page was allocate for - * @data: the page to free - * - * Free a page allocated from ring_buffer_alloc_read_page. - */ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) -{ - free_page((unsigned long)data); -} -EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); - -/** - * ring_buffer_read_page - extract a page from the ring buffer - * @buffer: buffer to extract from - * @data_page: the page to use allocated from ring_buffer_alloc_read_page - * @len: amount to extract - * @cpu: the cpu of the buffer to extract - * @full: should the extraction only happen when the page is full. - * - * This function will pull out a page from the ring buffer and consume it. - * @data_page must be the address of the variable that was returned - * from ring_buffer_alloc_read_page. This is because the page might be used - * to swap with a page in the ring buffer. 
- * - * for example: - * rpage = ring_buffer_alloc_read_page(buffer); - * if (!rpage) - * return error; - * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); - * if (ret >= 0) - * process_page(rpage, ret); - * - * When @full is set, the function will not return true unless - * the writer is off the reader page. - * - * Note: it is up to the calling functions to handle sleeps and wakeups. - * The ring buffer can be used anywhere in the kernel and can not - * blindly call wake_up. The layer that uses the ring buffer must be - * responsible for that. - * - * Returns: - * >=0 if data has been transferred, returns the offset of consumed data. - * <0 if no data has been transferred. - */ -int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, size_t len, int cpu, int full) -{ - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; - struct buffer_data_page *bpage; - struct buffer_page *reader; - unsigned long missed_events; - unsigned long flags; - unsigned int commit; - unsigned int read; - u64 save_timestamp; - int ret = -1; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; - - /* - * If len is not big enough to hold the page header, then - * we can not copy anything. - */ - if (len <= BUF_PAGE_HDR_SIZE) - goto out; - - len -= BUF_PAGE_HDR_SIZE; - - if (!data_page) - goto out; - - bpage = *data_page; - if (!bpage) - goto out; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - - reader = rb_get_reader_page(cpu_buffer); - if (!reader) - goto out_unlock; - - event = rb_reader_event(cpu_buffer); - - read = reader->read; - commit = rb_page_commit(reader); - - /* Check if any events were dropped */ - missed_events = cpu_buffer->lost_events; - - /* - * If this page has been partially read or - * if len is not big enough to read the rest of the page or - * a writer is still on the page, then - * we must copy the data from the page to the buffer. 
- * Otherwise, we can simply swap the page with the one passed in. - */ - if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { - struct buffer_data_page *rpage = cpu_buffer->reader_page->page; - unsigned int rpos = read; - unsigned int pos = 0; - unsigned int size; - - if (full) - goto out_unlock; - - if (len > (commit - read)) - len = (commit - read); - - /* Always keep the time extend and data together */ - size = rb_event_ts_length(event); - - if (len < size) - goto out_unlock; - - /* save the current timestamp, since the user will need it */ - save_timestamp = cpu_buffer->read_stamp; - - /* Need to copy one event at a time */ - do { - /* We need the size of one event, because - * rb_advance_reader only advances by one event, - * whereas rb_event_ts_length may include the size of - * one or two events. - * We have already ensured there's enough space if this - * is a time extend. */ - size = rb_event_length(event); - memcpy(bpage->data + pos, rpage->data + rpos, size); - - len -= size; - - rb_advance_reader(cpu_buffer); - rpos = reader->read; - pos += size; - - if (rpos >= commit) - break; - - event = rb_reader_event(cpu_buffer); - /* Always keep the time extend and data together */ - size = rb_event_ts_length(event); - } while (len >= size); - - /* update bpage */ - local_set(&bpage->commit, pos); - bpage->time_stamp = save_timestamp; - - /* we copied everything to the beginning */ - read = 0; - } else { - /* update the entry counter */ - cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += BUF_PAGE_SIZE; - - /* swap the pages */ - rb_init_page(bpage); - bpage = reader->page; - reader->page = *data_page; - local_set(&reader->write, 0); - local_set(&reader->entries, 0); - reader->read = 0; - *data_page = bpage; - - /* - * Use the real_end for the data size, - * This gives us a chance to store the lost events - * on the page. 
- */ - if (reader->real_end) - local_set(&bpage->commit, reader->real_end); - } - ret = read; - - cpu_buffer->lost_events = 0; - - commit = local_read(&bpage->commit); - /* - * Set a flag in the commit field if we lost events - */ - if (missed_events) { - /* If there is room at the end of the page to save the - * missed events, then record it there. - */ - if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { - memcpy(&bpage->data[commit], &missed_events, - sizeof(missed_events)); - local_add(RB_MISSED_STORED, &bpage->commit); - commit += sizeof(missed_events); - } - local_add(RB_MISSED_EVENTS, &bpage->commit); - } - - /* - * This page may be off to user land. Zero it out here. - */ - if (commit < BUF_PAGE_SIZE) - memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); - - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_page); - -#ifdef CONFIG_TRACING -static ssize_t -rb_simple_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - char buf[64]; - int r; - - if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) - r = sprintf(buf, "permanently disabled\n"); - else - r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -rb_simple_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val) - set_bit(RB_BUFFERS_ON_BIT, p); - else - clear_bit(RB_BUFFERS_ON_BIT, p); - - (*ppos)++; - - return cnt; -} - -static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, - .read = rb_simple_read, - .write = rb_simple_write, - .llseek = default_llseek, -}; - - -static __init int rb_init_debugfs(void) -{ - struct dentry *d_tracer; - - d_tracer = 
tracing_init_dentry(); - - trace_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - - return 0; -} - -fs_initcall(rb_init_debugfs); -#endif - -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct ring_buffer *buffer = - container_of(self, struct ring_buffer, cpu_notify); - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (cpumask_test_cpu(cpu, buffer->cpumask)) - return NOTIFY_OK; - - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); - if (!buffer->buffers[cpu]) { - WARN(1, "failed to allocate ring buffer on CPU %ld\n", - cpu); - return NOTIFY_OK; - } - smp_wmb(); - cpumask_set_cpu(cpu, buffer->cpumask); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Do nothing. - * If we were to free the buffer, then the user would - * lose any trace that was in the buffer. - */ - break; - default: - break; - } - return NOTIFY_OK; -} -#endif -/* - * ring buffer tester and benchmark - * - * Copyright (C) 2009 Steven Rostedt - */ -#include -#include -#include -#include -#include -#include - -struct rb_page { - u64 ts; - local_t commit; - char data[4080]; -}; - -/* run time and sleep time in seconds */ -#define RUN_TIME 10 -#define SLEEP_TIME 10 - -/* number of events for writer to wake up the reader */ -static int wakeup_interval = 100; - -static int reader_finish; -static struct completion read_start; -static struct completion read_done; - -static struct ring_buffer *buffer; -static struct task_struct *producer; -static struct task_struct *consumer; -static unsigned long read; - -static int disable_reader; -module_param(disable_reader, uint, 0644); -MODULE_PARM_DESC(disable_reader, "only run producer"); - -static int write_iteration = 50; -module_param(write_iteration, uint, 0644); -MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); - -static int 
producer_nice = 19; -static int consumer_nice = 19; - -static int producer_fifo = -1; -static int consumer_fifo = -1; - -module_param(producer_nice, uint, 0644); -MODULE_PARM_DESC(producer_nice, "nice prio for producer"); - -module_param(consumer_nice, uint, 0644); -MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); - -module_param(producer_fifo, uint, 0644); -MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); - -module_param(consumer_fifo, uint, 0644); -MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); - -static int read_events; - -static int kill_test; - -#define KILL_TEST() \ - do { \ - if (!kill_test) { \ - kill_test = 1; \ - WARN_ON(1); \ - } \ - } while (0) - -enum event_status { - EVENT_FOUND, - EVENT_DROPPED, -}; - -static enum event_status read_event(int cpu) -{ - struct ring_buffer_event *event; - int *entry; - u64 ts; - - event = ring_buffer_consume(buffer, cpu, &ts, NULL); - if (!event) - return EVENT_DROPPED; - - entry = ring_buffer_event_data(event); - if (*entry != cpu) { - KILL_TEST(); - return EVENT_DROPPED; - } - - read++; - return EVENT_FOUND; -} - -static enum event_status read_page(int cpu) -{ - struct ring_buffer_event *event; - struct rb_page *rpage; - unsigned long commit; - void *bpage; - int *entry; - int ret; - int inc; - int i; - - bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) - return EVENT_DROPPED; - - ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); - if (ret >= 0) { - rpage = bpage; - /* The commit may have missed event flags set, clear them */ - commit = local_read(&rpage->commit) & 0xfffff; - for (i = 0; i < commit && !kill_test; i += inc) { - - if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { - KILL_TEST(); - break; - } - - inc = -1; - event = (void *)&rpage->data[i]; - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - /* failed writes may be discarded events */ - if (!event->time_delta) - KILL_TEST(); - inc = event->array[0] + 4; - break; - case 
RINGBUF_TYPE_TIME_EXTEND: - inc = 8; - break; - case 0: - entry = ring_buffer_event_data(event); - if (*entry != cpu) { - KILL_TEST(); - break; - } - read++; - if (!event->array[0]) { - KILL_TEST(); - break; - } - inc = event->array[0] + 4; - break; - default: - entry = ring_buffer_event_data(event); - if (*entry != cpu) { - KILL_TEST(); - break; - } - read++; - inc = ((event->type_len + 1) * 4); - } - if (kill_test) - break; - - if (inc <= 0) { - KILL_TEST(); - break; - } - } - } - ring_buffer_free_read_page(buffer, bpage); - - if (ret < 0) - return EVENT_DROPPED; - return EVENT_FOUND; -} - -static void ring_buffer_consumer(void) -{ - /* toggle between reading pages and events */ - read_events ^= 1; - - read = 0; - while (!reader_finish && !kill_test) { - int found; - - do { - int cpu; - - found = 0; - for_each_online_cpu(cpu) { - enum event_status stat; - - if (read_events) - stat = read_event(cpu); - else - stat = read_page(cpu); - - if (kill_test) - break; - if (stat == EVENT_FOUND) - found = 1; - } - } while (found && !kill_test); - - set_current_state(TASK_INTERRUPTIBLE); - if (reader_finish) - break; - - schedule(); - __set_current_state(TASK_RUNNING); - } - reader_finish = 0; - complete(&read_done); -} - -static void ring_buffer_producer(void) -{ - struct timeval start_tv; - struct timeval end_tv; - unsigned long long time; - unsigned long long entries; - unsigned long long overruns; - unsigned long missed = 0; - unsigned long hit = 0; - unsigned long avg; - int cnt = 0; - - /* - * Hammer the buffer for 10 secs (this may - * make the system stall) - */ - trace_printk("Starting ring buffer hammer\n"); - do_gettimeofday(&start_tv); - do { - struct ring_buffer_event *event; - int *entry; - int i; - - for (i = 0; i < write_iteration; i++) { - event = ring_buffer_lock_reserve(buffer, 10); - if (!event) { - missed++; - } else { - hit++; - entry = ring_buffer_event_data(event); - *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); - } - } - 
do_gettimeofday(&end_tv); - - cnt++; - if (consumer && !(cnt % wakeup_interval)) - wake_up_process(consumer); - -#ifndef CONFIG_PREEMPT - /* - * If we are a non preempt kernel, the 10 second run will - * stop everything while it runs. Instead, we will call - * cond_resched and also add any time that was lost by a - * rescedule. - * - * Do a cond resched at the same frequency we would wake up - * the reader. - */ - if (cnt % wakeup_interval) - cond_resched(); -#endif - - } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); - trace_printk("End ring buffer hammer\n"); - - if (consumer) { - /* Init both completions here to avoid races */ - init_completion(&read_start); - init_completion(&read_done); - /* the completions must be visible before the finish var */ - smp_wmb(); - reader_finish = 1; - /* finish var visible before waking up the consumer */ - smp_wmb(); - wake_up_process(consumer); - wait_for_completion(&read_done); - } - - time = end_tv.tv_sec - start_tv.tv_sec; - time *= USEC_PER_SEC; - time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); - - entries = ring_buffer_entries(buffer); - overruns = ring_buffer_overruns(buffer); - - if (kill_test) - trace_printk("ERROR!\n"); - - if (!disable_reader) { - if (consumer_fifo < 0) - trace_printk("Running Consumer at nice: %d\n", - consumer_nice); - else - trace_printk("Running Consumer at SCHED_FIFO %d\n", - consumer_fifo); - } - if (producer_fifo < 0) - trace_printk("Running Producer at nice: %d\n", - producer_nice); - else - trace_printk("Running Producer at SCHED_FIFO %d\n", - producer_fifo); - - /* Let the user know that the test is running at low priority */ - if (producer_fifo < 0 && consumer_fifo < 0 && - producer_nice == 19 && consumer_nice == 19) - trace_printk("WARNING!!! 
This test is running at lowest priority.\n"); - - trace_printk("Time: %lld (usecs)\n", time); - trace_printk("Overruns: %lld\n", overruns); - if (disable_reader) - trace_printk("Read: (reader disabled)\n"); - else - trace_printk("Read: %ld (by %s)\n", read, - read_events ? "events" : "pages"); - trace_printk("Entries: %lld\n", entries); - trace_printk("Total: %lld\n", entries + overruns + read); - trace_printk("Missed: %ld\n", missed); - trace_printk("Hit: %ld\n", hit); - - /* Convert time from usecs to millisecs */ - do_div(time, USEC_PER_MSEC); - if (time) - hit /= (long)time; - else - trace_printk("TIME IS ZERO??\n"); - - trace_printk("Entries per millisec: %ld\n", hit); - - if (hit) { - /* Calculate the average time in nanosecs */ - avg = NSEC_PER_MSEC / hit; - trace_printk("%ld ns per entry\n", avg); - } - - if (missed) { - if (time) - missed /= (long)time; - - trace_printk("Total iterations per millisec: %ld\n", - hit + missed); - - /* it is possible that hit + missed will overflow and be zero */ - if (!(hit + missed)) { - trace_printk("hit + missed overflowed and totalled zero!\n"); - hit--; /* make it non zero */ - } - - /* Caculate the average time in nanosecs */ - avg = NSEC_PER_MSEC / (hit + missed); - trace_printk("%ld ns per entry\n", avg); - } -} - -static void wait_to_die(void) -{ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); -} - -static int ring_buffer_consumer_thread(void *arg) -{ - while (!kthread_should_stop() && !kill_test) { - complete(&read_start); - - ring_buffer_consumer(); - - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop() || kill_test) - break; - - schedule(); - __set_current_state(TASK_RUNNING); - } - __set_current_state(TASK_RUNNING); - - if (kill_test) - wait_to_die(); - - return 0; -} - -static int ring_buffer_producer_thread(void *arg) -{ - init_completion(&read_start); - - while 
(!kthread_should_stop() && !kill_test) { - ring_buffer_reset(buffer); - - if (consumer) { - smp_wmb(); - wake_up_process(consumer); - wait_for_completion(&read_start); - } - - ring_buffer_producer(); - - trace_printk("Sleeping for 10 secs\n"); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ * SLEEP_TIME); - __set_current_state(TASK_RUNNING); - } - - if (kill_test) - wait_to_die(); - - return 0; -} - -static int __init ring_buffer_benchmark_init(void) -{ - int ret; - - /* make a one meg buffer in overwite mode */ - buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); - if (!buffer) - return -ENOMEM; - - if (!disable_reader) { - consumer = kthread_create(ring_buffer_consumer_thread, - NULL, "rb_consumer"); - ret = PTR_ERR(consumer); - if (IS_ERR(consumer)) - goto out_fail; - } - - producer = kthread_run(ring_buffer_producer_thread, - NULL, "rb_producer"); - ret = PTR_ERR(producer); - - if (IS_ERR(producer)) - goto out_kill; - - /* - * Run them as low-prio background tasks by default: - */ - if (!disable_reader) { - if (consumer_fifo >= 0) { - struct sched_param param = { - .sched_priority = consumer_fifo - }; - sched_setscheduler(consumer, SCHED_FIFO, ¶m); - } else - set_user_nice(consumer, consumer_nice); - } - - if (producer_fifo >= 0) { - struct sched_param param = { - .sched_priority = consumer_fifo - }; - sched_setscheduler(producer, SCHED_FIFO, ¶m); - } else - set_user_nice(producer, producer_nice); - - return 0; - - out_kill: - if (consumer) - kthread_stop(consumer); - - out_fail: - ring_buffer_free(buffer); - return ret; -} - -static void __exit ring_buffer_benchmark_exit(void) -{ - kthread_stop(producer); - if (consumer) - kthread_stop(consumer); - ring_buffer_free(buffer); -} - -module_init(ring_buffer_benchmark_init); -module_exit(ring_buffer_benchmark_exit); - -MODULE_AUTHOR("Steven Rostedt"); -MODULE_DESCRIPTION("ring_buffer_benchmark"); -MODULE_LICENSE("GPL"); -/* - * Power trace points - * - * Copyright (C) 2009 Ming Lei - */ - 
-#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); -EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); -EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); -EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); -/* - * ring buffer based function tracer - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * Originally taken from the RT patch by: - * Arnaldo Carvalho de Melo - * - * Based on code from the latency_tracer, that is: - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -/* - * On boot up, the ring buffer is set to the minimum size, so that - * we do not waste memory on systems that are not using tracing. - */ -int ring_buffer_expanded; - -/* - * We need to change this state when a selftest is running. - * A selftest will lurk into the ring-buffer to count the - * entries inserted during the selftest although some concurrent - * insertions into the ring-buffer such as trace_printk could occurred - * at the same time, giving false positive or negative results. - */ -static bool __read_mostly tracing_selftest_running; - -/* - * If a tracer is running, we do not want to run SELFTEST. - */ -bool __read_mostly tracing_selftest_disabled; - -/* For tracers that don't implement custom flags */ -static struct tracer_opt dummy_tracer_opt[] = { - { } -}; - -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - -static int dummy_set_flag(u32 old_flags, u32 bit, int set) -{ - return 0; -} - -/* - * Kill all tracing for good (never come back). 
- * It is initialized to 1 but will turn to zero if the initialization - * of the tracer is successful. But that is the only place that sets - * this back to zero. - */ -static int tracing_disabled = 1; - -DEFINE_PER_CPU(int, ftrace_cpu_disabled); - -static inline void ftrace_disable_cpu(void) -{ - preempt_disable(); - __this_cpu_inc(ftrace_cpu_disabled); -} - -static inline void ftrace_enable_cpu(void) -{ - __this_cpu_dec(ftrace_cpu_disabled); - preempt_enable(); -} - -cpumask_var_t __read_mostly tracing_buffer_mask; - -/* - * ftrace_dump_on_oops - variable to dump ftrace buffer on oops - * - * If there is an oops (or kernel panic) and the ftrace_dump_on_oops - * is set, then ftrace_dump is called. This will output the contents - * of the ftrace buffers to the console. This is very useful for - * capturing traces that lead to crashes and outputing it to a - * serial console. - * - * It is default off, but you can enable it with either specifying - * "ftrace_dump_on_oops" in the kernel command line, or setting - * /proc/sys/kernel/ftrace_dump_on_oops - * Set 1 if you want to dump buffers of all CPUs - * Set 2 if you want to dump the buffer of the CPU that triggered oops - */ - -enum ftrace_dump_mode ftrace_dump_on_oops; - -static int tracing_set_tracer(const char *buf); - -#define MAX_TRACER_SIZE 100 -static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; -static char *default_bootup_tracer; - -static int __init set_cmdline_ftrace(char *str) -{ - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); - default_bootup_tracer = bootup_tracer_buf; - /* We are using ftrace early, expand it */ - ring_buffer_expanded = 1; - return 1; -} -__setup("ftrace=", set_cmdline_ftrace); - -static int __init set_ftrace_dump_on_oops(char *str) -{ - if (*str++ != '=' || !*str) { - ftrace_dump_on_oops = DUMP_ALL; - return 1; - } - - if (!strcmp("orig_cpu", str)) { - ftrace_dump_on_oops = DUMP_ORIG; - return 1; - } - - return 0; -} -__setup("ftrace_dump_on_oops", 
set_ftrace_dump_on_oops); - -unsigned long long ns2usecs(cycle_t nsec) -{ - nsec += 500; - do_div(nsec, 1000); - return nsec; -} - -/* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. Each CPU has the same number - * of pages allocated for its buffer. - */ -static struct trace_array global_trace; - -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); - -int filter_current_check_discard(struct ring_buffer *buffer, - struct ftrace_event_call *call, void *rec, - struct ring_buffer_event *event) -{ - return filter_check_discard(call, rec, buffer, event); -} -EXPORT_SYMBOL_GPL(filter_current_check_discard); - -cycle_t ftrace_now(int cpu) -{ - u64 ts; - - /* Early boot up does not have a buffer yet */ - if (!global_trace.buffer) - return trace_clock_local(); - - ts = ring_buffer_time_stamp(global_trace.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); - - return ts; -} - -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. 
- */ -static struct trace_array max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); - -/* tracer_enabled is used to toggle activation of a tracer */ -static int tracer_enabled = 1; - -/** - * tracing_is_enabled - return tracer_enabled status - * - * This function is used by other tracers to know the status - * of the tracer_enabled flag. Tracers may use this function - * to know if it should enable their features when starting - * up. See irqsoff tracer for an example (start_irqsoff_tracer). - */ -int tracing_is_enabled(void) -{ - return tracer_enabled; -} - -/* - * trace_buf_size is the size in bytes that is allocated - * for a buffer. Note, the number of bytes is always rounded - * to page size. - * - * This number is purposely set to a low number of 16384. - * If the dump on oops happens, it will be much appreciated - * to not have to wait for all that output. Anyway this can be - * boot time and run time configurable. - */ -#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ - -static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; - -/* trace_types holds a link list of available tracers. */ -static struct tracer *trace_types __read_mostly; - -/* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; - -/* - * trace_types_lock is used to protect the trace_types list. - */ -static DEFINE_MUTEX(trace_types_lock); - -/* - * serialize the access of the ring buffer - * - * ring buffer serializes readers, but it is low level protection. - * The validity of the events (which returns by ring_buffer_peek() ..etc) - * are not protected by ring buffer. - * - * The content of events may become garbage if we allow other process consumes - * these events concurrently: - * A) the page of the consumed events may become a normal page - * (not reader page) in ring buffer, and this page will be rewrited - * by events producer. 
- * B) The page of the consumed events may become a page for splice_read, - * and this page will be returned to system. - * - * These primitives allow multi process access to different cpu ring buffer - * concurrently. - * - * These primitives don't distinguish read-only and read-consume access. - * Multi read-only access are also serialized. - */ - -#ifdef CONFIG_SMP -static DECLARE_RWSEM(all_cpu_access_lock); -static DEFINE_PER_CPU(struct mutex, cpu_access_lock); - -static inline void trace_access_lock(int cpu) -{ - if (cpu == TRACE_PIPE_ALL_CPU) { - /* gain it for accessing the whole ring buffer. */ - down_write(&all_cpu_access_lock); - } else { - /* gain it for accessing a cpu ring buffer. */ - - /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ - down_read(&all_cpu_access_lock); - - /* Secondly block other access to this @cpu ring buffer. */ - mutex_lock(&per_cpu(cpu_access_lock, cpu)); - } -} - -static inline void trace_access_unlock(int cpu) -{ - if (cpu == TRACE_PIPE_ALL_CPU) { - up_write(&all_cpu_access_lock); - } else { - mutex_unlock(&per_cpu(cpu_access_lock, cpu)); - up_read(&all_cpu_access_lock); - } -} - -static inline void trace_access_lock_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - mutex_init(&per_cpu(cpu_access_lock, cpu)); -} - -#else - -static DEFINE_MUTEX(access_lock); - -static inline void trace_access_lock(int cpu) -{ - (void)cpu; - mutex_lock(&access_lock); -} - -static inline void trace_access_unlock(int cpu) -{ - (void)cpu; - mutex_unlock(&access_lock); -} - -static inline void trace_access_lock_init(void) -{ -} - -#endif - -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - -/* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - 
TRACE_ITER_IRQ_INFO; - -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - -static void wakeup_work_handler(struct work_struct *work) -{ - wake_up(&trace_wait); -} - -static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -void trace_wake_up(void) -{ - const unsigned long delay = msecs_to_jiffies(2); - - if (trace_flags & TRACE_ITER_BLOCK) - return; - schedule_delayed_work(&wakeup_work, delay); -} - -static int __init set_buf_size(char *str) -{ - unsigned long buf_size; - - if (!str) - return 0; - buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; - return 1; -} -__setup("trace_buf_size=", set_buf_size); - -static int __init set_tracing_thresh(char *str) -{ - unsigned long threshhold; - int ret; - - if (!str) - return 0; - ret = strict_strtoul(str, 0, &threshhold); - if (ret < 0) - return 0; - tracing_thresh = threshhold * 1000; - return 1; -} -__setup("tracing_thresh=", set_tracing_thresh); - -unsigned long nsecs_to_usecs(unsigned long nsecs) -{ - return nsecs / 1000; -} - -/* These must match the bit postions in trace_iterator_flags */ -static const char *trace_options[] = { - "print-parent", - "sym-offset", - "sym-addr", - "verbose", - "raw", - "hex", - "bin", - "block", - "stacktrace", - "trace_printk", - "ftrace_preempt", - "branch", - "annotate", - "userstacktrace", - "sym-userobj", - "printk-msg-only", - "context-info", - "latency-format", - "sleep-time", - "graph-time", - "record-cmd", - "overwrite", - "disable_on_free", - "irq-info", - NULL -}; - -static struct { - u64 (*func)(void); - const char *name; -} trace_clocks[] = { - { trace_clock_local, "local" }, - { trace_clock_global, "global" }, - { 
trace_clock_counter, "counter" }, -}; - -int trace_clock_id; - -/* - * trace_parser_get_init - gets the buffer for trace parser - */ -int trace_parser_get_init(struct trace_parser *parser, int size) -{ - memset(parser, 0, sizeof(*parser)); - - parser->buffer = kmalloc(size, GFP_KERNEL); - if (!parser->buffer) - return 1; - - parser->size = size; - return 0; -} - -/* - * trace_parser_put - frees the buffer for trace parser - */ -void trace_parser_put(struct trace_parser *parser) -{ - kfree(parser->buffer); -} - -/* - * trace_get_user - reads the user input string separated by space - * (matched by isspace(ch)) - * - * For each string found the 'struct trace_parser' is updated, - * and the function returns. - * - * Returns number of bytes read. - * - * See kernel/trace/trace.h for 'struct trace_parser' details. - */ -int trace_get_user(struct trace_parser *parser, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char ch; - size_t read = 0; - ssize_t ret; - - if (!*ppos) - trace_parser_clear(parser); - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - - read++; - cnt--; - - /* - * The parser is not finished with the last write, - * continue reading the user input without skipping spaces. - */ - if (!parser->cont) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - /* only spaces were written */ - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - parser->idx = 0; - } - - /* read the non-space input */ - while (cnt && !isspace(ch)) { - if (parser->idx < parser->size - 1) - parser->buffer[parser->idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } - - /* We either got finished input or we have to wait for another call. 
*/ - if (isspace(ch)) { - parser->buffer[parser->idx] = 0; - parser->cont = false; - } else { - parser->cont = true; - parser->buffer[parser->idx++] = ch; - } - - *ppos += read; - ret = read; - -out: - return ret; -} - -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) -{ - int len; - int ret; - - if (!cnt) - return 0; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; - - s->readpos += cnt; - return cnt; -} - -static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) -{ - int len; - void *ret; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = memcpy(buf, s->buffer + s->readpos, cnt); - if (!ret) - return -EFAULT; - - s->readpos += cnt; - return cnt; -} - -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - -unsigned long __read_mostly tracing_thresh; - -#ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. 
(this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data; - - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; - - max_data = max_tr.data[cpu]; - max_data->saved_latency = tracing_max_latency; - max_data->critical_start = data->critical_start; - max_data->critical_end = data->critical_end; - - memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); - max_data->pid = tsk->pid; - max_data->uid = task_uid(tsk); - max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - max_data->policy = tsk->policy; - max_data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(tsk); -} - -/** - * update_max_tr - snapshot all trace buffers from global_trace to max_tr - * @tr: tracer - * @tsk: the task with the latency - * @cpu: The cpu that initiated the trace. - * - * Flip the buffers between the @tr and the max_tr and record information - * about which task was the cause of this latency. - */ -void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct ring_buffer *buf = tr->buffer; - - if (trace_stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); - return; - } - arch_spin_lock(&ftrace_max_lock); - - tr->buffer = max_tr.buffer; - max_tr.buffer = buf; - - __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); -} - -/** - * update_max_tr_single - only copy one trace over, and reset the rest - * @tr - tracer - * @tsk - task with the latency - * @cpu - the cpu of the buffer to copy. - * - * Flip the trace of a single CPU buffer between the @tr and the max_tr. 
- */ -void -update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - int ret; - - if (trace_stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); - return; - } - - arch_spin_lock(&ftrace_max_lock); - - ftrace_disable_cpu(); - - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); - - if (ret == -EBUSY) { - /* - * We failed to swap the buffer due to a commit taking - * place on this CPU. We fail to record, but we reset - * the max trace buffer (no one writes directly to it) - * and flag that it failed. - */ - trace_array_printk(&max_tr, _THIS_IP_, - "Failed to swap buffers due to commit in progress\n"); - } - - ftrace_enable_cpu(); - - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); - - __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); -} -#endif /* CONFIG_TRACER_MAX_TRACE */ - -/** - * register_tracer - register a tracer with the ftrace system. - * @type - the plugin for the tracer - * - * Register a new plugin tracer. 
- */ -int register_tracer(struct tracer *type) -__releases(kernel_lock) -__acquires(kernel_lock) -{ - struct tracer *t; - int ret = 0; - - if (!type->name) { - pr_info("Tracer must have a name\n"); - return -1; - } - - if (strlen(type->name) >= MAX_TRACER_SIZE) { - pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); - return -1; - } - - mutex_lock(&trace_types_lock); - - tracing_selftest_running = true; - - for (t = trace_types; t; t = t->next) { - if (strcmp(type->name, t->name) == 0) { - /* already found */ - pr_info("Tracer %s already registered\n", - type->name); - ret = -1; - goto out; - } - } - - if (!type->set_flag) - type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else - if (!type->flags->opts) - type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; - - -#ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest && !tracing_selftest_disabled) { - struct tracer *saved_tracer = current_trace; - struct trace_array *tr = &global_trace; - - /* - * Run a selftest on this tracer. - * Here we reset the trace buffer, and set the current - * tracer to be this tracer. The tracer can then run some - * internal tracing to verify that everything is in order. - * If we fail, we do not register this tracer. 
- */ - tracing_reset_online_cpus(tr); - - current_trace = type; - - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size); - - /* the test is responsible for initializing and enabling */ - pr_info("Testing tracer %s: ", type->name); - ret = type->selftest(type, tr); - /* the test is responsible for resetting too */ - current_trace = saved_tracer; - if (ret) { - printk(KERN_CONT "FAILED!\n"); - goto out; - } - /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(tr); - - /* Shrink the max buffer again */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1); - - printk(KERN_CONT "PASSED\n"); - } -#endif - - type->next = trace_types; - trace_types = type; - - out: - tracing_selftest_running = false; - mutex_unlock(&trace_types_lock); - - if (ret || !default_bootup_tracer) - goto out_unlock; - - if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; - - printk(KERN_INFO "Starting tracer '%s'\n", type->name); - /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(type->name); - default_bootup_tracer = NULL; - /* disable other selftests, since this will break it. 
*/ - tracing_selftest_disabled = 1; -#ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", - type->name); -#endif - - out_unlock: - return ret; -} - -void unregister_tracer(struct tracer *type) -{ - struct tracer **t; - - mutex_lock(&trace_types_lock); - for (t = &trace_types; *t; t = &(*t)->next) { - if (*t == type) - goto found; - } - pr_info("Tracer %s not registered\n", type->name); - goto out; - - found: - *t = (*t)->next; - - if (type == current_trace && tracer_enabled) { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(&global_trace); - current_trace = &nop_trace; - } -out: - mutex_unlock(&trace_types_lock); -} - -static void __tracing_reset(struct ring_buffer *buffer, int cpu) -{ - ftrace_disable_cpu(); - ring_buffer_reset_cpu(buffer, cpu); - ftrace_enable_cpu(); -} - -void tracing_reset(struct trace_array *tr, int cpu) -{ - struct ring_buffer *buffer = tr->buffer; - - ring_buffer_record_disable(buffer); - - /* Make sure all commits have finished */ - synchronize_sched(); - __tracing_reset(buffer, cpu); - - ring_buffer_record_enable(buffer); -} - -void tracing_reset_online_cpus(struct trace_array *tr) -{ - struct ring_buffer *buffer = tr->buffer; - int cpu; - - ring_buffer_record_disable(buffer); - - /* Make sure all commits have finished */ - synchronize_sched(); - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - __tracing_reset(buffer, cpu); - - ring_buffer_record_enable(buffer); -} - -void tracing_reset_current(int cpu) -{ - tracing_reset(&global_trace, cpu); -} - -void tracing_reset_current_online_cpus(void) -{ - tracing_reset_online_cpus(&global_trace); -} - -#define SAVED_CMDLINES 128 -#define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; -static arch_spinlock_t 
trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; - -/* temporary disable recording */ -static atomic_t trace_record_cmdline_disabled __read_mostly; - -static void trace_init_cmdlines(void) -{ - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; -} - -int is_tracing_stopped(void) -{ - return trace_stop_count; -} - -/** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected. This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) -{ - tracing_disabled = 1; - ftrace_stop(); - tracing_off_permanent(); -} - -/** - * tracing_start - quick start of the tracer - * - * If tracing is enabled but was stopped by tracing_stop, - * this will start the tracer back up. - */ -void tracing_start(void) -{ - struct ring_buffer *buffer; - unsigned long flags; - - if (tracing_disabled) - return; - - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) { - if (trace_stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - trace_stop_count = 0; - } - goto out; - } - - /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); - - buffer = global_trace.buffer; - if (buffer) - ring_buffer_record_enable(buffer); - - buffer = max_tr.buffer; - if (buffer) - ring_buffer_record_enable(buffer); - - arch_spin_unlock(&ftrace_max_lock); - - ftrace_start(); - out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); -} - -/** - * tracing_stop - quick stop of the tracer - * - * Light weight way to stop tracing. Use in conjunction with - * tracing_start. 
- */ -void tracing_stop(void) -{ - struct ring_buffer *buffer; - unsigned long flags; - - ftrace_stop(); - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (trace_stop_count++) - goto out; - - /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); - - buffer = global_trace.buffer; - if (buffer) - ring_buffer_record_disable(buffer); - - buffer = max_tr.buffer; - if (buffer) - ring_buffer_record_disable(buffer); - - arch_spin_unlock(&ftrace_max_lock); - - out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); -} - -void trace_stop_cmdline_recording(void); - -static void trace_save_cmdline(struct task_struct *tsk) -{ - unsigned pid, idx; - - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; - - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - */ - if (!arch_spin_trylock(&trace_cmdline_lock)) - return; - - idx = map_pid_to_cmdline[tsk->pid]; - if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; - - /* - * Check whether the cmdline buffer at idx has a pid - * mapped. We are going to overwrite that entry so we - * need to clear the map_pid_to_cmdline. Otherwise we - * would read the new comm for the old pid. 
- */ - pid = map_cmdline_to_pid[idx]; - if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; - - cmdline_idx = idx; - } - - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - - arch_spin_unlock(&trace_cmdline_lock); -} - -void trace_find_cmdline(int pid, char comm[]) -{ - unsigned map; - - if (!pid) { - strcpy(comm, ""); - return; - } - - if (WARN_ON_ONCE(pid < 0)) { - strcpy(comm, ""); - return; - } - - if (pid > PID_MAX_DEFAULT) { - strcpy(comm, "<...>"); - return; - } - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - map = map_pid_to_cmdline[pid]; - if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); - else - strcpy(comm, "<...>"); - - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -void tracing_record_cmdline(struct task_struct *tsk) -{ - if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || - !tracing_is_on()) - return; - - trace_save_cmdline(tsk); -} - -void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, - int pc) -{ - struct task_struct *tsk = current; - - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; - entry->flags = -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | -#else - TRACE_FLAG_IRQS_NOSUPPORT | -#endif - ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); -} -EXPORT_SYMBOL_GPL(tracing_generic_entry_update); - -struct ring_buffer_event * -trace_buffer_lock_reserve(struct ring_buffer *buffer, - int type, - unsigned long len, - unsigned long flags, int pc) -{ - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) { - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; - } - - return event; -} - -static inline void -__trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) -{ - ring_buffer_unlock_commit(buffer, event); - - ftrace_trace_stack(buffer, flags, 6, pc); - ftrace_trace_userstack(buffer, flags, pc); - - if (wake) - trace_wake_up(); -} - -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); -} - -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); - -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); - -void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); -} -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); - -void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned 
long flags, int pc, - struct pt_regs *regs) -{ - ring_buffer_unlock_commit(buffer, event); - - ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); - ftrace_trace_userstack(buffer, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); - -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - ring_buffer_discard_commit(buffer, event); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); - -void -trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->buffer; - struct ring_buffer_event *event; - struct ftrace_entry *entry; - - /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), - flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->parent_ip = parent_ip; - - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); -} - -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) -{ - if (likely(!atomic_read(&data->disabled))) - trace_function(tr, ip, parent_ip, flags, pc); -} - -#ifdef CONFIG_STACKTRACE - -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) -struct ftrace_stack { - unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; -}; - -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); -static DEFINE_PER_CPU(int, ftrace_stack_reserve); - -static void __ftrace_trace_stack(struct ring_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) -{ - struct ftrace_event_call *call = &event_kernel_stack; - struct ring_buffer_event *event; - struct stack_entry *entry; - struct stack_trace trace; 
- int use_stack; - int size = FTRACE_STACK_ENTRIES; - - trace.nr_entries = 0; - trace.skip = skip; - - /* - * Since events can happen in NMIs there's no safe way to - * use the per cpu ftrace_stacks. We reserve it and if an interrupt - * or NMI comes in, it will just have to use the default - * FTRACE_STACK_SIZE. - */ - preempt_disable_notrace(); - - use_stack = ++__get_cpu_var(ftrace_stack_reserve); - /* - * We don't need any atomic variables, just a barrier. - * If an interrupt comes in, we don't care, because it would - * have exited and put the counter back to what we want. - * We just need a barrier to keep gcc from moving things - * around. - */ - barrier(); - if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; - trace.max_entries = FTRACE_STACK_MAX_ENTRIES; - - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - - if (trace.nr_entries > size) - size = trace.nr_entries; - } else - /* From now on, use_stack is a boolean */ - use_stack = 0; - - size *= sizeof(unsigned long); - - event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry) + size, flags, pc); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - - memset(&entry->caller, 0, size); - - if (use_stack) - memcpy(&entry->caller, trace.entries, - trace.nr_entries * sizeof(unsigned long)); - else { - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.entries = entry->caller; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - } - - entry->size = trace.nr_entries; - - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - - out: - /* Again, don't let gcc optimize things here */ - barrier(); - __get_cpu_var(ftrace_stack_reserve)--; - preempt_enable_notrace(); - -} - -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs) -{ - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - 
return; - - __ftrace_trace_stack(buffer, flags, skip, pc, regs); -} - -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) -{ - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); -} - -void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc) -{ - __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); -} - -/** - * trace_dump_stack - record a stack back trace in the trace buffer - */ -void trace_dump_stack(void) -{ - unsigned long flags; - - if (tracing_disabled || tracing_selftest_running) - return; - - local_save_flags(flags); - - /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); -} - -static DEFINE_PER_CPU(int, user_stack_count); - -void -ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_user_stack; - struct ring_buffer_event *event; - struct userstack_entry *entry; - struct stack_trace trace; - - if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) - return; - - /* - * NMIs can not handle page faults, even with fix ups. - * The save user stack can (and often does) fault. - */ - if (unlikely(in_nmi())) - return; - - /* - * prevent recursion, since the user stack tracing may - * trigger other kernel events. 
- */ - preempt_disable(); - if (__this_cpu_read(user_stack_count)) - goto out; - - __this_cpu_inc(user_stack_count); - - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, - sizeof(*entry), flags, pc); - if (!event) - goto out_drop_count; - entry = ring_buffer_event_data(event); - - entry->tgid = current->tgid; - memset(&entry->caller, 0, sizeof(entry->caller)); - - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = 0; - trace.entries = entry->caller; - - save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - - out_drop_count: - __this_cpu_dec(user_stack_count); - out: - preempt_enable(); -} - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) -{ - ftrace_trace_userstack(tr, flags, preempt_count()); -} -#endif /* UNUSED */ - -#endif /* CONFIG_STACKTRACE */ - -/** - * trace_vbprintk - write binary msg to tracing buffer - * - */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - static arch_spinlock_t trace_buf_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - static u32 trace_buf[TRACE_BUF_SIZE]; - - struct ftrace_event_call *call = &event_bprint; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - struct bprint_entry *entry; - unsigned long flags; - int disable; - int cpu, len = 0, size, pc; - - if (unlikely(tracing_selftest_running || tracing_disabled)) - return 0; - - /* Don't pollute graph traces with trace_vprintk internals */ - pause_graph_tracing(); - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) - goto out; - - /* Lockdep uses trace_printk for lock tracing */ - local_irq_save(flags); - arch_spin_lock(&trace_buf_lock); - len = vbin_printf(trace_buf, 
TRACE_BUF_SIZE, fmt, args); - - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; - - size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); - } - -out_unlock: - arch_spin_unlock(&trace_buf_lock); - local_irq_restore(flags); - -out: - atomic_dec_return(&data->disabled); - preempt_enable_notrace(); - unpause_graph_tracing(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) -{ - static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - - struct ftrace_event_call *call = &event_print; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; - struct print_entry *entry; - unsigned long irq_flags; - int disable; - - if (tracing_disabled || tracing_selftest_running) - return 0; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) - goto out; - - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - arch_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - size = sizeof(*entry) + len + 1; - buffer = 
tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 6, pc); - } - - out_unlock: - arch_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); - out: - atomic_dec_return(&data->disabled); - preempt_enable_notrace(); - - return len; -} - -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) -{ - return trace_array_vprintk(&global_trace, ip, fmt, args); -} -EXPORT_SYMBOL_GPL(trace_vprintk); - -static void trace_iterator_increment(struct trace_iterator *iter) -{ - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - - iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - - ftrace_enable_cpu(); -} - -static struct trace_entry * -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, - unsigned long *lost_events) -{ - struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - - if (buf_iter) - event = ring_buffer_iter_peek(buf_iter, ts); - else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, - lost_events); - - ftrace_enable_cpu(); - - if (event) { - iter->ent_size = ring_buffer_event_length(event); - return ring_buffer_event_data(event); - } - iter->ent_size = 0; - return NULL; -} - -static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, - unsigned long *missing_events, u64 *ent_ts) -{ - struct ring_buffer *buffer = iter->tr->buffer; - struct trace_entry *ent, *next = NULL; - unsigned long lost_events = 0, next_lost = 0; - int 
cpu_file = iter->cpu_file; - u64 next_ts = 0, ts; - int next_cpu = -1; - int cpu; - - /* - * If we are in a per_cpu trace file, don't bother by iterating over - * all cpu and peek directly. - */ - if (cpu_file > TRACE_PIPE_ALL_CPU) { - if (ring_buffer_empty_cpu(buffer, cpu_file)) - return NULL; - ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); - if (ent_cpu) - *ent_cpu = cpu_file; - - return ent; - } - - for_each_tracing_cpu(cpu) { - - if (ring_buffer_empty_cpu(buffer, cpu)) - continue; - - ent = peek_next_entry(iter, cpu, &ts, &lost_events); - - /* - * Pick the entry with the smallest timestamp: - */ - if (ent && (!next || ts < next_ts)) { - next = ent; - next_cpu = cpu; - next_ts = ts; - next_lost = lost_events; - } - } - - if (ent_cpu) - *ent_cpu = next_cpu; - - if (ent_ts) - *ent_ts = next_ts; - - if (missing_events) - *missing_events = next_lost; - - return next; -} - -/* Find the next real entry, without updating the iterator itself */ -struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, - int *ent_cpu, u64 *ent_ts) -{ - return __find_next_entry(iter, ent_cpu, NULL, ent_ts); -} - -/* Find the next real entry, and increment the iterator to the next entry */ -void *trace_find_next_entry_inc(struct trace_iterator *iter) -{ - iter->ent = __find_next_entry(iter, &iter->cpu, - &iter->lost_events, &iter->ts); - - if (iter->ent) - trace_iterator_increment(iter); - - return iter->ent ? 
iter : NULL; -} - -static void trace_consume(struct trace_iterator *iter) -{ - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, - &iter->lost_events); - ftrace_enable_cpu(); -} - -static void *s_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_iterator *iter = m->private; - int i = (int)*pos; - void *ent; - - WARN_ON_ONCE(iter->leftover); - - (*pos)++; - - /* can't go backwards */ - if (iter->idx > i) - return NULL; - - if (iter->idx < 0) - ent = trace_find_next_entry_inc(iter); - else - ent = iter; - - while (ent && iter->idx < i) - ent = trace_find_next_entry_inc(iter); - - iter->pos = *pos; - - return ent; -} - -void tracing_iter_reset(struct trace_iterator *iter, int cpu) -{ - struct trace_array *tr = iter->tr; - struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter; - unsigned long entries = 0; - u64 ts; - - tr->data[cpu]->skipped_entries = 0; - - if (!iter->buffer_iter[cpu]) - return; - - buf_iter = iter->buffer_iter[cpu]; - ring_buffer_iter_reset(buf_iter); - - /* - * We could have the case with the max latency tracers - * that a reset never took place on a cpu. This is evident - * by the timestamp being before the start of the buffer. - */ - while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->tr->time_start) - break; - entries++; - ring_buffer_read(buf_iter, NULL); - } - - tr->data[cpu]->skipped_entries = entries; -} - -/* - * The current tracer is copied to avoid a global locking - * all around. 
- */ -static void *s_start(struct seq_file *m, loff_t *pos) -{ - struct trace_iterator *iter = m->private; - static struct tracer *old_tracer; - int cpu_file = iter->cpu_file; - void *p = NULL; - loff_t l = 0; - int cpu; - - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; - *iter->trace = *current_trace; - } - mutex_unlock(&trace_types_lock); - - atomic_inc(&trace_record_cmdline_disabled); - - if (*pos != iter->pos) { - iter->ent = NULL; - iter->cpu = 0; - iter->idx = -1; - - ftrace_disable_cpu(); - - if (cpu_file == TRACE_PIPE_ALL_CPU) { - for_each_tracing_cpu(cpu) - tracing_iter_reset(iter, cpu); - } else - tracing_iter_reset(iter, cpu_file); - - ftrace_enable_cpu(); - - iter->leftover = 0; - for (p = iter; p && l < *pos; p = s_next(m, p, &l)) - ; - - } else { - /* - * If we overflowed the seq_file before, then we want - * to just reuse the trace_seq buffer again. - */ - if (iter->leftover) - p = iter; - else { - l = *pos - 1; - p = s_next(m, p, &l); - } - } - - trace_event_read_lock(); - trace_access_lock(cpu_file); - return p; -} - -static void s_stop(struct seq_file *m, void *p) -{ - struct trace_iterator *iter = m->private; - - atomic_dec(&trace_record_cmdline_disabled); - trace_access_unlock(iter->cpu_file); - trace_event_read_unlock(); -} - -static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) -{ - unsigned long count; - int cpu; - - *total = 0; - *entries = 0; - - for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. 
- */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); - *entries += count; - } -} - -static void print_lat_help_header(struct seq_file *m) -{ - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); -} - -static void print_event_info(struct trace_array *tr, struct seq_file *m) -{ - unsigned long total; - unsigned long entries; - - get_total_entries(tr, &total, &entries); - seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", - entries, total, num_online_cpus()); - seq_puts(m, "#\n"); -} - -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) -{ - print_event_info(tr, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); -} - -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) -{ - print_event_info(tr, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); -} - -void -print_trace_header(struct seq_file *m, struct trace_iterator *iter) -{ - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[tr->cpu]; - struct tracer *type = current_trace; - unsigned long entries; - unsigned long total; - const char *name = "preemption"; - - if (type) - name = type->name; 
- - get_total_entries(tr, &total, &entries); - - seq_printf(m, "# %s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); - seq_puts(m, "# -----------------------------------" - "---------------------------------\n"); - seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" - " (M:%s VP:%d, KP:%d, SP:%d HP:%d", - nsecs_to_usecs(data->saved_latency), - entries, - total, - tr->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#else - "unknown", -#endif - /* These are reserved for later use */ - 0, 0, 0, 0); -#ifdef CONFIG_SMP - seq_printf(m, " #P:%d)\n", num_online_cpus()); -#else - seq_puts(m, ")\n"); -#endif - seq_puts(m, "# -----------------\n"); - seq_printf(m, "# | task: %.16s-%d " - "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", - data->comm, data->pid, data->uid, data->nice, - data->policy, data->rt_priority); - seq_puts(m, "# -----------------\n"); - - if (data->critical_start) { - seq_puts(m, "# => started at: "); - seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); - trace_print_seq(m, &iter->seq); - seq_puts(m, "\n# => ended at: "); - seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); - trace_print_seq(m, &iter->seq); - seq_puts(m, "\n#\n"); - } - - seq_puts(m, "#\n"); -} - -static void test_cpu_buff_start(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - - if (!(trace_flags & TRACE_ITER_ANNOTATE)) - return; - - if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) - return; - - if (cpumask_test_cpu(iter->cpu, iter->started)) - return; - - if (iter->tr->data[iter->cpu]->skipped_entries) - return; - - cpumask_set_cpu(iter->cpu, iter->started); - - /* Don't print started cpu buffer for the first entry of the trace */ - if (iter->idx > 1) - trace_seq_printf(s, "##### CPU %u buffer started ####\n", - iter->cpu); -} - -static enum print_line_t print_trace_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - 
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *entry; - struct trace_event *event; - - entry = iter->ent; - - test_cpu_buff_start(iter); - - event = ftrace_find_event(entry->type); - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - if (!trace_print_lat_context(iter)) - goto partial; - } else { - if (!trace_print_context(iter)) - goto partial; - } - } - - if (event) - return event->funcs->trace(iter, sym_flags, event); - - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; - - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t print_raw_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry; - struct trace_event *event; - - entry = iter->ent; - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts)) - goto partial; - } - - event = ftrace_find_event(entry->type); - if (event) - return event->funcs->raw(iter, 0, event); - - if (!trace_seq_printf(s, "%d ?\n", entry->type)) - goto partial; - - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t print_hex_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - unsigned char newline = '\n'; - struct trace_entry *entry; - struct trace_event *event; - - entry = iter->ent; - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); - } - - event = ftrace_find_event(entry->type); - if (event) { - enum print_line_t ret = event->funcs->hex(iter, 0, event); - if (ret != TRACE_TYPE_HANDLED) - return ret; - } - - SEQ_PUT_FIELD_RET(s, newline); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t print_bin_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - 
struct trace_entry *entry; - struct trace_event *event; - - entry = iter->ent; - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); - } - - event = ftrace_find_event(entry->type); - return event ? event->funcs->binary(iter, 0, event) : - TRACE_TYPE_HANDLED; -} - -int trace_empty(struct trace_iterator *iter) -{ - int cpu; - - /* If we are looking at one CPU buffer, only check that one */ - if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { - cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) - return 0; - } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) - return 0; - } - return 1; - } - - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) - return 0; - } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) - return 0; - } - } - - return 1; -} - -/* Called with trace_event_read_lock() held. 
*/ -enum print_line_t print_trace_line(struct trace_iterator *iter) -{ - enum print_line_t ret; - - if (iter->lost_events && - !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events)) - return TRACE_TYPE_PARTIAL_LINE; - - if (iter->trace && iter->trace->print_line) { - ret = iter->trace->print_line(iter); - if (ret != TRACE_TYPE_UNHANDLED) - return ret; - } - - if (iter->ent->type == TRACE_BPRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return trace_print_bprintk_msg_only(iter); - - if (iter->ent->type == TRACE_PRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return trace_print_printk_msg_only(iter); - - if (trace_flags & TRACE_ITER_BIN) - return print_bin_fmt(iter); - - if (trace_flags & TRACE_ITER_HEX) - return print_hex_fmt(iter); - - if (trace_flags & TRACE_ITER_RAW) - return print_raw_fmt(iter); - - return print_trace_fmt(iter); -} - -void trace_latency_header(struct seq_file *m) -{ - struct trace_iterator *iter = m->private; - - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - print_trace_header(m, iter); - - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); -} - -void trace_default_header(struct seq_file *m) -{ - struct trace_iterator *iter = m->private; - - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return; - - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); - } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); - else - print_func_help_header(iter->tr, m); - } - } -} - -static void test_ftrace_alive(struct seq_file *m) -{ - if (!ftrace_is_dead()) - return; - seq_printf(m, 
"# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); -} - -static int s_show(struct seq_file *m, void *v) -{ - struct trace_iterator *iter = v; - int ret; - - if (iter->ent == NULL) { - if (iter->tr) { - seq_printf(m, "# tracer: %s\n", iter->trace->name); - seq_puts(m, "#\n"); - test_ftrace_alive(m); - } - if (iter->trace && iter->trace->print_header) - iter->trace->print_header(m); - else - trace_default_header(m); - - } else if (iter->leftover) { - /* - * If we filled the seq_file buffer earlier, we - * want to just show it now. - */ - ret = trace_print_seq(m, &iter->seq); - - /* ret should this time be zero, but you never know */ - iter->leftover = ret; - - } else { - print_trace_line(iter); - ret = trace_print_seq(m, &iter->seq); - /* - * If we overflow the seq_file buffer, then it will - * ask us for this data again at start up. - * Use that instead. - * ret is 0 if seq_file write succeeded. - * -1 otherwise. - */ - iter->leftover = ret; - } - - return 0; -} - -static const struct seq_operations tracer_seq_ops = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) -{ - long cpu_file = (long) inode->i_private; - void *fail_ret = ERR_PTR(-ENOMEM); - struct trace_iterator *iter; - struct seq_file *m; - int cpu, ret; - - if (tracing_disabled) - return ERR_PTR(-ENODEV); - - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return ERR_PTR(-ENOMEM); - - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. 
- */ - mutex_lock(&trace_types_lock); - iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) - goto fail; - - if (current_trace) - *iter->trace = *current_trace; - - if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) - goto fail; - - if (current_trace && current_trace->print_max) - iter->tr = &max_tr; - else - iter->tr = &global_trace; - iter->pos = -1; - mutex_init(&iter->mutex); - iter->cpu_file = cpu_file; - - /* Notify the tracer early; before we stop tracing. */ - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - - /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->tr->buffer)) - iter->iter_flags |= TRACE_FILE_ANNOTATE; - - /* stop the trace while dumping */ - tracing_stop(); - - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { - for_each_tracing_cpu(cpu) { - iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); - tracing_iter_reset(iter, cpu); - } - } else { - cpu = iter->cpu_file; - iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); - tracing_iter_reset(iter, cpu); - } - - ret = seq_open(file, &tracer_seq_ops); - if (ret < 0) { - fail_ret = ERR_PTR(ret); - goto fail_buffer; - } - - m = file->private_data; - m->private = iter; - - mutex_unlock(&trace_types_lock); - - return iter; - - fail_buffer: - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } - free_cpumask_var(iter->started); - tracing_start(); - fail: - mutex_unlock(&trace_types_lock); - kfree(iter->trace); - kfree(iter); - - return fail_ret; -} - -int tracing_open_generic(struct inode *inode, struct file *filp) -{ - if (tracing_disabled) - return -ENODEV; - - filp->private_data = inode->i_private; - return 0; -} - -static 
int tracing_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - struct trace_iterator *iter; - int cpu; - - if (!(file->f_mode & FMODE_READ)) - return 0; - - iter = m->private; - - mutex_lock(&trace_types_lock); - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } - - if (iter->trace && iter->trace->close) - iter->trace->close(iter); - - /* reenable tracing if it was previously enabled */ - tracing_start(); - mutex_unlock(&trace_types_lock); - - seq_release(inode, file); - mutex_destroy(&iter->mutex); - free_cpumask_var(iter->started); - kfree(iter->trace); - kfree(iter); - return 0; -} - -static int tracing_open(struct inode *inode, struct file *file) -{ - struct trace_iterator *iter; - int ret = 0; - - /* If this file was open for write, then erase contents */ - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - long cpu = (long) inode->i_private; - - if (cpu == TRACE_PIPE_ALL_CPU) - tracing_reset_online_cpus(&global_trace); - else - tracing_reset(&global_trace, cpu); - } - - if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) - iter->iter_flags |= TRACE_FILE_LAT_FMT; - } - return ret; -} - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct tracer *t = v; - - (*pos)++; - - if (t) - t = t->next; - - return t; -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - struct tracer *t; - loff_t l = 0; - - mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; -} - -static void t_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&trace_types_lock); -} - -static int t_show(struct seq_file *m, void *v) -{ - struct tracer *t = v; - - if (!t) - return 0; - - seq_printf(m, "%s", t->name); - if (t->next) - seq_putc(m, ' '); - else - seq_putc(m, '\n'); - 
- return 0; -} - -static const struct seq_operations show_traces_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int show_traces_open(struct inode *inode, struct file *file) -{ - if (tracing_disabled) - return -ENODEV; - - return seq_open(file, &show_traces_seq_ops); -} - -static ssize_t -tracing_write_stub(struct file *filp, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - return count; -} - -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) -{ - if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); - else - return 0; -} - -static const struct file_operations tracing_fops = { - .open = tracing_open, - .read = seq_read, - .write = tracing_write_stub, - .llseek = tracing_seek, - .release = tracing_release, -}; - -static const struct file_operations show_traces_fops = { - .open = show_traces_open, - .read = seq_read, - .release = seq_release, - .llseek = seq_lseek, -}; - -/* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_var_t tracing_cpumask; - -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - -static ssize_t -tracing_cpumask_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - int len; - - mutex_lock(&tracing_cpumask_update_lock); - - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); - if (count - len < 2) { - count = -EINVAL; - goto out_err; - } - len += sprintf(mask_str + len, "\n"); - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); - -out_err: - mutex_unlock(&tracing_cpumask_update_lock); - - return count; -} - -static ssize_t -tracing_cpumask_write(struct file *filp, const char __user *ubuf, - 
size_t count, loff_t *ppos) -{ - int err, cpu; - cpumask_var_t tracing_cpumask_new; - - if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) - return -ENOMEM; - - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); - if (err) - goto err_unlock; - - mutex_lock(&tracing_cpumask_update_lock); - - local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); - for_each_tracing_cpu(cpu) { - /* - * Increase/decrease the disabled counter if we are - * about to flip a bit in the cpumask: - */ - if (cpumask_test_cpu(cpu, tracing_cpumask) && - !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&global_trace.data[cpu]->disabled); - } - if (!cpumask_test_cpu(cpu, tracing_cpumask) && - cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&global_trace.data[cpu]->disabled); - } - } - arch_spin_unlock(&ftrace_max_lock); - local_irq_enable(); - - cpumask_copy(tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); - free_cpumask_var(tracing_cpumask_new); - - return count; - -err_unlock: - free_cpumask_var(tracing_cpumask_new); - - return err; -} - -static const struct file_operations tracing_cpumask_fops = { - .open = tracing_open_generic, - .read = tracing_cpumask_read, - .write = tracing_cpumask_write, - .llseek = generic_file_llseek, -}; - -static int tracing_trace_options_show(struct seq_file *m, void *v) -{ - struct tracer_opt *trace_opts; - u32 tracer_flags; - int i; - - mutex_lock(&trace_types_lock); - tracer_flags = current_trace->flags->val; - trace_opts = current_trace->flags->opts; - - for (i = 0; trace_options[i]; i++) { - if (trace_flags & (1 << i)) - seq_printf(m, "%s\n", trace_options[i]); - else - seq_printf(m, "no%s\n", trace_options[i]); - } - - for (i = 0; trace_opts[i].name; i++) { - if (tracer_flags & trace_opts[i].bit) - seq_printf(m, "%s\n", trace_opts[i].name); - else - seq_printf(m, "no%s\n", trace_opts[i].name); - } - mutex_unlock(&trace_types_lock); - - return 0; -} - -static int 
__set_tracer_option(struct tracer *trace, - struct tracer_flags *tracer_flags, - struct tracer_opt *opts, int neg) -{ - int ret; - - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); - if (ret) - return ret; - - if (neg) - tracer_flags->val &= ~opts->bit; - else - tracer_flags->val |= opts->bit; - return 0; -} - -/* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) -{ - struct tracer_flags *tracer_flags = trace->flags; - struct tracer_opt *opts = NULL; - int i; - - for (i = 0; tracer_flags->opts[i].name; i++) { - opts = &tracer_flags->opts[i]; - - if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); - } - - return -EINVAL; -} - -static void set_tracer_flags(unsigned int mask, int enabled) -{ - /* do nothing if flag is already set */ - if (!!(trace_flags & mask) == !!enabled) - return; - - if (enabled) - trace_flags |= mask; - else - trace_flags &= ~mask; - - if (mask == TRACE_ITER_RECORD_CMD) - trace_event_enable_cmd_record(enabled); - - if (mask == TRACE_ITER_OVERWRITE) - ring_buffer_change_overwrite(global_trace.buffer, enabled); -} - -static ssize_t -tracing_trace_options_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int neg = 0; - int ret; - int i; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - if (strncmp(cmp, "no", 2) == 0) { - neg = 1; - cmp += 2; - } - - for (i = 0; trace_options[i]; i++) { - if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); - break; - } - } - - /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); - ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - } - - *ppos += cnt; - - return cnt; -} - -static int 
tracing_trace_options_open(struct inode *inode, struct file *file) -{ - if (tracing_disabled) - return -ENODEV; - return single_open(file, tracing_trace_options_show, NULL); -} - -static const struct file_operations tracing_iter_fops = { - .open = tracing_trace_options_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = tracing_trace_options_write, -}; - -static const char readme_msg[] = - "tracing mini-HOWTO:\n\n" - "# mount -t debugfs nodev /sys/kernel/debug\n\n" - "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "nop\n" - "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "sched_switch\n" - "# cat /sys/kernel/debug/tracing/trace_options\n" - "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" - "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" -; - -static ssize_t -tracing_readme_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return simple_read_from_buffer(ubuf, cnt, ppos, - readme_msg, strlen(readme_msg)); -} - -static const struct file_operations tracing_readme_fops = { - .open = tracing_open_generic, - .read = tracing_readme_read, - .llseek = generic_file_llseek, -}; - -static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; - - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; - - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); - return -ENOMEM; - } - - buf = file_buf; - - for (i = 
0; i < SAVED_CMDLINES; i++) { - int r; - - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; - - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; - } - - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); - - kfree(file_buf); - kfree(buf_comm); - - return len; -} - -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, -}; - -static ssize_t -tracing_ctrl_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - r = sprintf(buf, "%u\n", tracer_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_ctrl_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - val = !!val; - - mutex_lock(&trace_types_lock); - if (tracer_enabled ^ val) { - - /* Only need to warn if this is used to change the state */ - WARN_ONCE(1, "tracing_enabled is deprecated. 
Use tracing_on"); - - if (val) { - tracer_enabled = 1; - if (current_trace->start) - current_trace->start(tr); - tracing_start(); - } else { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(tr); - } - } - mutex_unlock(&trace_types_lock); - - *ppos += cnt; - - return cnt; -} - -static ssize_t -tracing_set_trace_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_TRACER_SIZE+2]; - int r; - - mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); - mutex_unlock(&trace_types_lock); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -int tracer_init(struct tracer *t, struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - return t->init(tr); -} - -static int __tracing_resize_ring_buffer(unsigned long size) -{ - int ret; - - /* - * If kernel or user changes the size of the ring buffer - * we use the size that was given, and we can forget about - * expanding it later. - */ - ring_buffer_expanded = 1; - - ret = ring_buffer_resize(global_trace.buffer, size); - if (ret < 0) - return ret; - - if (!current_trace->use_max_tr) - goto out; - - ret = ring_buffer_resize(max_tr.buffer, size); - if (ret < 0) { - int r; - - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); - if (r < 0) { - /* - * AARGH! We are left with different - * size max buffer!!!! - * The max buffer is our "snapshot" buffer. - * When a tracer needs a snapshot (one of the - * latency tracers), it swaps the max buffer - * with the saved snap shot. We succeeded to - * update the size of the main buffer, but failed to - * update the size of the max buffer. But when we tried - * to reset the main buffer to the original size, we - * failed there too. This is very unlikely to - * happen, but if it does, warn and kill all - * tracing. 
- */ - WARN_ON(1); - tracing_disabled = 1; - } - return ret; - } - - max_tr.entries = size; - out: - global_trace.entries = size; - - return ret; -} - -static ssize_t tracing_resize_ring_buffer(unsigned long size) -{ - int cpu, ret = size; - - mutex_lock(&trace_types_lock); - - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); - } - - if (size != global_trace.entries) - ret = __tracing_resize_ring_buffer(size); - - if (ret < 0) - ret = -ENOMEM; - - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } - - tracing_start(); - mutex_unlock(&trace_types_lock); - - return ret; -} - - -/** - * tracing_update_buffers - used by tracing facility to expand ring buffers - * - * To save on memory when the tracing is never used on a system with it - * configured in. The ring buffers are set to a minimum size. But once - * a user starts to use the tracing facility, then they need to grow - * to their default size. - * - * This function is to be called when a tracer is about to be used. 
- */ -int tracing_update_buffers(void) -{ - int ret = 0; - - mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size); - mutex_unlock(&trace_types_lock); - - return ret; -} - -struct trace_option_dentry; - -static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); - -static void -destroy_trace_option_files(struct trace_option_dentry *topts); - -static int tracing_set_tracer(const char *buf) -{ - static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; - struct tracer *t; - int ret = 0; - - mutex_lock(&trace_types_lock); - - if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size); - if (ret < 0) - goto out; - ret = 0; - } - - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) - break; - } - if (!t) { - ret = -EINVAL; - goto out; - } - if (t == current_trace) - goto out; - - trace_branch_disable(); - if (current_trace && current_trace->reset) - current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. 
- */ - ring_buffer_resize(max_tr.buffer, 1); - max_tr.entries = 1; - } - destroy_trace_option_files(topts); - - current_trace = t; - - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { - ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); - if (ret < 0) - goto out; - max_tr.entries = global_trace.entries; - } - - if (t->init) { - ret = tracer_init(t, tr); - if (ret) - goto out; - } - - trace_branch_enable(tr); - out: - mutex_unlock(&trace_types_lock); - - return ret; -} - -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_TRACER_SIZE+1]; - int i; - size_t ret; - int err; - - ret = cnt; - - if (cnt > MAX_TRACER_SIZE) - cnt = MAX_TRACER_SIZE; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; - - err = tracing_set_tracer(buf); - if (err) - return err; - - *ppos += ret; - - return ret; -} - -static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *ptr = filp->private_data; - char buf[64]; - int r; - - r = snprintf(buf, sizeof(buf), "%ld\n", - *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); - if (r > sizeof(buf)) - r = sizeof(buf); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *ptr = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - *ptr = val * 1000; - - return cnt; -} - -static int tracing_open_pipe(struct inode *inode, struct file *filp) -{ - long cpu_file = (long) inode->i_private; - struct trace_iterator *iter; - int ret = 0; - - if (tracing_disabled) - return -ENODEV; - - mutex_lock(&trace_types_lock); - - /* create a buffer to store the information to pass to userspace */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - ret = -ENOMEM; - goto out; - } - - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. - */ - iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - ret = -ENOMEM; - goto fail; - } - if (current_trace) - *iter->trace = *current_trace; - - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { - ret = -ENOMEM; - goto fail; - } - - /* trace pipe does not show start of buffer */ - cpumask_setall(iter->started); - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - iter->iter_flags |= TRACE_FILE_LAT_FMT; - - iter->cpu_file = cpu_file; - iter->tr = &global_trace; - mutex_init(&iter->mutex); - filp->private_data = iter; - - if (iter->trace->pipe_open) - iter->trace->pipe_open(iter); - - nonseekable_open(inode, filp); -out: - mutex_unlock(&trace_types_lock); - return ret; - -fail: - kfree(iter->trace); - kfree(iter); - mutex_unlock(&trace_types_lock); - return ret; -} - -static int tracing_release_pipe(struct inode *inode, struct file *file) -{ - struct trace_iterator *iter = file->private_data; - - mutex_lock(&trace_types_lock); - - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); - - 
mutex_unlock(&trace_types_lock); - - free_cpumask_var(iter->started); - mutex_destroy(&iter->mutex); - kfree(iter->trace); - kfree(iter); - - return 0; -} - -static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) -{ - struct trace_iterator *iter = filp->private_data; - - if (trace_flags & TRACE_ITER_BLOCK) { - /* - * Always select as readable when in blocking mode - */ - return POLLIN | POLLRDNORM; - } else { - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - poll_wait(filp, &trace_wait, poll_table); - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - - return 0; - } -} - - -void default_wait_pipe(struct trace_iterator *iter) -{ - DEFINE_WAIT(wait); - - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - - if (trace_empty(iter)) - schedule(); - - finish_wait(&trace_wait, &wait); -} - -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); -} - -/* Must be called with trace_types_lock mutex held. */ -static int tracing_wait_pipe(struct file *filp) -{ - struct trace_iterator *iter = filp->private_data; - - while (trace_empty(iter)) { - - if ((filp->f_flags & O_NONBLOCK)) { - return -EAGAIN; - } - - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - - /* - * We block until we read something and tracing is disabled. 
- * We still block if tracing is disabled, but we have never - * read anything. This allows a user to cat this file, and - * then enable tracing. But after we have read something, - * we give an EOF when tracing is again disabled. - * - * iter->pos will be 0 if we haven't read anything. - */ - if (!tracer_enabled && iter->pos) - break; - } - - return 1; -} - -/* - * Consumer reader. - */ -static ssize_t -tracing_read_pipe(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_iterator *iter = filp->private_data; - static struct tracer *old_tracer; - ssize_t sret; - - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; - *iter->trace = *current_trace; - } - mutex_unlock(&trace_types_lock); - - /* - * Avoid more than one consumer on a single file descriptor - * This is just a matter of traces coherency, the ring buffer itself - * is protected. 
- */ - mutex_lock(&iter->mutex); - if (iter->trace->read) { - sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); - if (sret) - goto out; - } - -waitagain: - sret = tracing_wait_pipe(filp); - if (sret <= 0) - goto out; - - /* stop when tracing is finished */ - if (trace_empty(iter)) { - sret = 0; - goto out; - } - - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; - - /* reset all but tr, trace, and overruns */ - memset(&iter->seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); - iter->pos = -1; - - trace_event_read_lock(); - trace_access_lock(iter->cpu_file); - while (trace_find_next_entry_inc(iter) != NULL) { - enum print_line_t ret; - int len = iter->seq.len; - - ret = print_trace_line(iter); - if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ - iter->seq.len = len; - break; - } - if (ret != TRACE_TYPE_NO_CONSUME) - trace_consume(iter); - - if (iter->seq.len >= cnt) - break; - - /* - * Setting the full flag means we reached the trace_seq buffer - * size and we should leave by partial output condition above. - * One of the trace_seq_* functions is not used properly. - */ - WARN_ONCE(iter->seq.full, "full flag set for trace type %d", - iter->ent->type); - } - trace_access_unlock(iter->cpu_file); - trace_event_read_unlock(); - - /* Now copy what we have to the user */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.readpos >= iter->seq.len) - trace_seq_init(&iter->seq); - - /* - * If there was nothing to send to user, in spite of consuming trace - * entries, go back to wait for more entries. 
- */ - if (sret == -EBUSY) - goto waitagain; - -out: - mutex_unlock(&iter->mutex); - - return sret; -} - -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - __free_page(buf->page); -} - -static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, - unsigned int idx) -{ - __free_page(spd->pages[idx]); -} - -static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - -static size_t -tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) -{ - size_t count; - int ret; - - /* Seq buffer is page-sized, exactly what we need. */ - for (;;) { - count = iter->seq.len; - ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; - break; - } - if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; - break; - } - - if (ret != TRACE_TYPE_NO_CONSUME) - trace_consume(iter); - rem -= count; - if (!trace_find_next_entry_inc(iter)) { - rem = 0; - iter->ent = NULL; - break; - } - } - - return rem; -} - -static ssize_t tracing_splice_read_pipe(struct file *filp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - struct page *pages_def[PIPE_DEF_BUFFERS]; - struct partial_page partial_def[PIPE_DEF_BUFFERS]; - struct trace_iterator *iter = filp->private_data; - struct splice_pipe_desc spd = { - .pages = pages_def, - .partial = partial_def, - .nr_pages = 0, /* This gets updated below. 
*/ - .flags = flags, - .ops = &tracing_pipe_buf_ops, - .spd_release = tracing_spd_release_pipe, - }; - static struct tracer *old_tracer; - ssize_t ret; - size_t rem; - unsigned int i; - - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; - *iter->trace = *current_trace; - } - mutex_unlock(&trace_types_lock); - - mutex_lock(&iter->mutex); - - if (iter->trace->splice_read) { - ret = iter->trace->splice_read(iter, filp, - ppos, pipe, len, flags); - if (ret) - goto out_err; - } - - ret = tracing_wait_pipe(filp); - if (ret <= 0) - goto out_err; - - if (!iter->ent && !trace_find_next_entry_inc(iter)) { - ret = -EFAULT; - goto out_err; - } - - trace_event_read_lock(); - trace_access_lock(iter->cpu_file); - - /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { - spd.pages[i] = alloc_page(GFP_KERNEL); - if (!spd.pages[i]) - break; - - rem = tracing_fill_pipe_page(rem, iter); - - /* Copy the data into the page, so we can start over. 
*/ - ret = trace_seq_to_buffer(&iter->seq, - page_address(spd.pages[i]), - iter->seq.len); - if (ret < 0) { - __free_page(spd.pages[i]); - break; - } - spd.partial[i].offset = 0; - spd.partial[i].len = iter->seq.len; - - trace_seq_init(&iter->seq); - } - - trace_access_unlock(iter->cpu_file); - trace_event_read_unlock(); - mutex_unlock(&iter->mutex); - - spd.nr_pages = i; - - ret = splice_to_pipe(pipe, &spd); -out: - splice_shrink_spd(pipe, &spd); - return ret; - -out_err: - mutex_unlock(&iter->mutex); - goto out; -} - -static ssize_t -tracing_entries_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - char buf[96]; - int r; - - mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - r = sprintf(buf, "%lu (expanded: %lu)\n", - tr->entries >> 10, - trace_buf_size >> 10); - else - r = sprintf(buf, "%lu\n", tr->entries >> 10); - mutex_unlock(&trace_types_lock); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_entries_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - /* must have at least 1 entry */ - if (!val) - return -EINVAL; - - /* value is in KB */ - val <<= 10; - - ret = tracing_resize_ring_buffer(val); - if (ret < 0) - return ret; - - *ppos += cnt; - - return cnt; -} - -static ssize_t -tracing_total_entries_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - char buf[64]; - int r, cpu; - unsigned long size = 0, expanded_size = 0; - - mutex_lock(&trace_types_lock); - for_each_tracing_cpu(cpu) { - size += tr->entries >> 10; - if (!ring_buffer_expanded) - expanded_size += trace_buf_size >> 10; - } - if (ring_buffer_expanded) - r = sprintf(buf, "%lu\n", size); - else - r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); - 
mutex_unlock(&trace_types_lock); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_free_buffer_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - /* - * There is no need to read what the user has written, this function - * is just to make sure that there is no error when "echo" is used - */ - - *ppos += cnt; - - return cnt; -} - -static int -tracing_free_buffer_release(struct inode *inode, struct file *filp) -{ - /* disable tracing ? */ - if (trace_flags & TRACE_ITER_STOP_ON_FREE) - tracing_off(); - /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0); - - return 0; -} - -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) -{ - unsigned long addr = (unsigned long)ubuf; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct print_entry *entry; - unsigned long irq_flags; - struct page *pages[2]; - int nr_pages = 1; - ssize_t written; - void *page1; - void *page2; - int offset; - int size; - int len; - int ret; - - if (tracing_disabled) - return -EINVAL; - - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - /* - * Userspace is injecting traces into the kernel trace buffer. - * We want to be as non intrusive as possible. - * To do so, we do not want to allocate any special buffers - * or take any locks, but instead write the userspace data - * straight into the ring buffer. - * - * First we need to pin the userspace buffer into memory, - * which, most likely it is, because it just referenced it. - * But there's no guarantee that it is. By using get_user_pages_fast() - * and kmap_atomic/kunmap_atomic() we can get access to the - * pages directly. We then write the data directly into the - * ring buffer. 
- */ - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - - /* check if we cross pages */ - if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) - nr_pages = 2; - - offset = addr & (PAGE_SIZE - 1); - addr &= PAGE_MASK; - - ret = get_user_pages_fast(addr, nr_pages, 0, pages); - if (ret < nr_pages) { - while (--ret >= 0) - put_page(pages[ret]); - written = -EFAULT; - goto out; - } - - page1 = kmap_atomic(pages[0]); - if (nr_pages == 2) - page2 = kmap_atomic(pages[1]); - - local_save_flags(irq_flags); - size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, preempt_count()); - if (!event) { - /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } - - entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; - - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->buf, page1 + offset, len); - memcpy(&entry->buf[len], page2, cnt - len); - } else - memcpy(&entry->buf, page1 + offset, cnt); - - if (entry->buf[cnt - 1] != '\n') { - entry->buf[cnt] = '\n'; - entry->buf[cnt + 1] = '\0'; - } else - entry->buf[cnt] = '\0'; - - ring_buffer_unlock_commit(buffer, event); - - written = cnt; - - *fpos += written; - - out_unlock: - if (nr_pages == 2) - kunmap_atomic(page2); - kunmap_atomic(page1); - while (nr_pages > 0) - put_page(pages[--nr_pages]); - out: - return written; -} - -static int tracing_clock_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) - seq_printf(m, - "%s%s%s%s", i ? " " : "", - i == trace_clock_id ? "[" : "", trace_clocks[i].name, - i == trace_clock_id ? 
"]" : ""); - seq_putc(m, '\n'); - - return 0; -} - -static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) -{ - char buf[64]; - const char *clockstr; - int i; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - clockstr = strstrip(buf); - - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { - if (strcmp(trace_clocks[i].name, clockstr) == 0) - break; - } - if (i == ARRAY_SIZE(trace_clocks)) - return -EINVAL; - - trace_clock_id = i; - - mutex_lock(&trace_types_lock); - - ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); - - mutex_unlock(&trace_types_lock); - - *fpos += cnt; - - return cnt; -} - -static int tracing_clock_open(struct inode *inode, struct file *file) -{ - if (tracing_disabled) - return -ENODEV; - return single_open(file, tracing_clock_show, NULL); -} - -static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, - .llseek = generic_file_llseek, -}; - -static const struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, - .llseek = generic_file_llseek, -}; - -static const struct file_operations set_tracer_fops = { - .open = tracing_open_generic, - .read = tracing_set_trace_read, - .write = tracing_set_trace_write, - .llseek = generic_file_llseek, -}; - -static const struct file_operations tracing_pipe_fops = { - .open = tracing_open_pipe, - .poll = tracing_poll_pipe, - .read = tracing_read_pipe, - .splice_read = tracing_splice_read_pipe, - .release = tracing_release_pipe, - .llseek = no_llseek, -}; - -static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, - .read = tracing_entries_read, - .write = tracing_entries_write, 
- .llseek = generic_file_llseek, -}; - -static const struct file_operations tracing_total_entries_fops = { - .open = tracing_open_generic, - .read = tracing_total_entries_read, - .llseek = generic_file_llseek, -}; - -static const struct file_operations tracing_free_buffer_fops = { - .write = tracing_free_buffer_write, - .release = tracing_free_buffer_release, -}; - -static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic, - .write = tracing_mark_write, - .llseek = generic_file_llseek, -}; - -static const struct file_operations trace_clock_fops = { - .open = tracing_clock_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = tracing_clock_write, -}; - -struct ftrace_buffer_info { - struct trace_array *tr; - void *spare; - int cpu; - unsigned int read; -}; - -static int tracing_buffers_open(struct inode *inode, struct file *filp) -{ - int cpu = (int)(long)inode->i_private; - struct ftrace_buffer_info *info; - - if (tracing_disabled) - return -ENODEV; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - info->tr = &global_trace; - info->cpu = cpu; - info->spare = NULL; - /* Force reading ring buffer for first read */ - info->read = (unsigned int)-1; - - filp->private_data = info; - - return nonseekable_open(inode, filp); -} - -static ssize_t -tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct ftrace_buffer_info *info = filp->private_data; - ssize_t ret; - size_t size; - - if (!count) - return 0; - - if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); - if (!info->spare) - return -ENOMEM; - - /* Do we have previous read data to read? 
*/ - if (info->read < PAGE_SIZE) - goto read; - - trace_access_lock(info->cpu); - ret = ring_buffer_read_page(info->tr->buffer, - &info->spare, - count, - info->cpu, 0); - trace_access_unlock(info->cpu); - if (ret < 0) - return 0; - - info->read = 0; - -read: - size = PAGE_SIZE - info->read; - if (size > count) - size = count; - - ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) - return -EFAULT; - size -= ret; - - *ppos += size; - info->read += size; - - return size; -} - -static int tracing_buffers_release(struct inode *inode, struct file *file) -{ - struct ftrace_buffer_info *info = file->private_data; - - if (info->spare) - ring_buffer_free_read_page(info->tr->buffer, info->spare); - kfree(info); - - return 0; -} - -struct buffer_ref { - struct ring_buffer *buffer; - void *page; - int ref; -}; - -static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct buffer_ref *ref = (struct buffer_ref *)buf->private; - - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->page); - kfree(ref); - buf->private = 0; -} - -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct buffer_ref *ref = (struct buffer_ref *)buf->private; - - ref->ref++; -} - -/* Pipe buffer operations for a buffer. */ -static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, - .get = buffer_pipe_buf_get, -}; - -/* - * Callback from splice_to_pipe(), if we need to release some pages - * at the end of the spd in case we error'ed out in filling the pipe. 
- */ -static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) -{ - struct buffer_ref *ref = - (struct buffer_ref *)spd->partial[i].private; - - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->page); - kfree(ref); - spd->partial[i].private = 0; -} - -static ssize_t -tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial_def[PIPE_DEF_BUFFERS]; - struct page *pages_def[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages_def, - .partial = partial_def, - .flags = flags, - .ops = &buffer_pipe_buf_ops, - .spd_release = buffer_spd_release, - }; - struct buffer_ref *ref; - int entries, size, i; - size_t ret; - - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - ret = -EINVAL; - goto out; - } - - if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) { - ret = -EINVAL; - goto out; - } - len &= PAGE_MASK; - } - - trace_access_lock(info->cpu); - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { - struct page *page; - int r; - - ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) - break; - - ref->ref = 1; - ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); - if (!ref->page) { - kfree(ref); - break; - } - - r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 1); - if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); - kfree(ref); - break; - } - - /* - * zero out any left over data, this is going to - * user land. 
- */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - - page = virt_to_page(ref->page); - - spd.pages[i] = page; - spd.partial[i].len = PAGE_SIZE; - spd.partial[i].offset = 0; - spd.partial[i].private = (unsigned long)ref; - spd.nr_pages++; - *ppos += PAGE_SIZE; - - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - } - - trace_access_unlock(info->cpu); - spd.nr_pages = i; - - /* did we read anything? */ - if (!spd.nr_pages) { - if (flags & SPLICE_F_NONBLOCK) - ret = -EAGAIN; - else - ret = 0; - /* TODO: block */ - goto out; - } - - ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); -out: - return ret; -} - -static const struct file_operations tracing_buffers_fops = { - .open = tracing_buffers_open, - .read = tracing_buffers_read, - .release = tracing_buffers_release, - .splice_read = tracing_buffers_splice_read, - .llseek = no_llseek, -}; - -static ssize_t -tracing_stats_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - unsigned long cpu = (unsigned long)filp->private_data; - struct trace_array *tr = &global_trace; - struct trace_seq *s; - unsigned long cnt; - unsigned long long t; - unsigned long usec_rem; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - - cnt = ring_buffer_entries_cpu(tr->buffer, cpu); - trace_seq_printf(s, "entries: %ld\n", cnt); - - cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); - trace_seq_printf(s, "overrun: %ld\n", cnt); - - cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); - trace_seq_printf(s, "commit overrun: %ld\n", cnt); - - cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); - trace_seq_printf(s, "bytes: %ld\n", cnt); - - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); - usec_rem = 
do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); - - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); - - kfree(s); - - return count; -} - -static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic, - .read = tracing_stats_read, - .llseek = generic_file_llseek, -}; - -#ifdef CONFIG_DYNAMIC_FTRACE - -int __weak ftrace_arch_read_dyn_info(char *buf, int size) -{ - return 0; -} - -static ssize_t -tracing_read_dyn_info(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - static char ftrace_dyn_info_buffer[1024]; - static DEFINE_MUTEX(dyn_info_mutex); - unsigned long *p = filp->private_data; - char *buf = ftrace_dyn_info_buffer; - int size = ARRAY_SIZE(ftrace_dyn_info_buffer); - int r; - - mutex_lock(&dyn_info_mutex); - r = sprintf(buf, "%ld ", *p); - - r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); - buf[r++] = '\n'; - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - mutex_unlock(&dyn_info_mutex); - - return r; -} - -static const struct file_operations tracing_dyn_info_fops = { - .open = tracing_open_generic, - .read = tracing_read_dyn_info, - .llseek = generic_file_llseek, -}; -#endif - -static struct dentry *d_tracer; - -struct dentry *tracing_init_dentry(void) -{ - static int once; - - if (d_tracer) - return d_tracer; - - if (!debugfs_initialized()) - return NULL; - - d_tracer = debugfs_create_dir("tracing", NULL); - - if (!d_tracer && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'tracing'\n"); - return NULL; - } - - return d_tracer; -} - -static struct dentry *d_percpu; - -struct dentry *tracing_dentry_percpu(void) -{ - static int once; - struct dentry *d_tracer; - - if (d_percpu) - return d_percpu; - - d_tracer = tracing_init_dentry(); - - if (!d_tracer) - return NULL; - - d_percpu = debugfs_create_dir("per_cpu", d_tracer); - - if (!d_percpu && !once) { - once = 1; - pr_warning("Could not create debugfs directory 
'per_cpu'\n"); - return NULL; - } - - return d_percpu; -} - -static void tracing_init_debugfs_percpu(long cpu) -{ - struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *d_cpu; - char cpu_dir[30]; /* 30 characters should be more than enough */ - - snprintf(cpu_dir, 30, "cpu%ld", cpu); - d_cpu = debugfs_create_dir(cpu_dir, d_percpu); - if (!d_cpu) { - pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); - return; - } - - /* per cpu trace_pipe */ - trace_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - - /* per cpu trace */ - trace_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - - trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - - trace_create_file("stats", 0444, d_cpu, - (void *) cpu, &tracing_stats_fops); -} - -#ifdef CONFIG_FTRACE_SELFTEST -/* Let selftest have access to static functions in this file */ -#include "trace_selftest.c" -#endif - -struct trace_option_dentry { - struct tracer_opt *opt; - struct tracer_flags *flags; - struct dentry *entry; -}; - -static ssize_t -trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct trace_option_dentry *topt = filp->private_data; - char *buf; - - if (topt->flags->val & topt->opt->bit) - buf = "1\n"; - else - buf = "0\n"; - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); -} - -static ssize_t -trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct trace_option_dentry *topt = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val != 0 && val != 1) - return -EINVAL; - - if (!!(topt->flags->val & topt->opt->bit) != val) { - mutex_lock(&trace_types_lock); - ret = __set_tracer_option(current_trace, topt->flags, - topt->opt, !val); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - } - - *ppos += cnt; - - return cnt; -} - 
- -static const struct file_operations trace_options_fops = { - .open = tracing_open_generic, - .read = trace_options_read, - .write = trace_options_write, - .llseek = generic_file_llseek, -}; - -static ssize_t -trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - long index = (long)filp->private_data; - char *buf; - - if (trace_flags & (1 << index)) - buf = "1\n"; - else - buf = "0\n"; - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); -} - -static ssize_t -trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - long index = (long)filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val != 0 && val != 1) - return -EINVAL; - set_tracer_flags(1 << index, val); - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations trace_options_core_fops = { - .open = tracing_open_generic, - .read = trace_options_core_read, - .write = trace_options_core_write, - .llseek = generic_file_llseek, -}; - -struct dentry *trace_create_file(const char *name, - umode_t mode, - struct dentry *parent, - void *data, - const struct file_operations *fops) -{ - struct dentry *ret; - - ret = debugfs_create_file(name, mode, parent, data, fops); - if (!ret) - pr_warning("Could not create debugfs '%s' entry\n", name); - - return ret; -} - - -static struct dentry *trace_options_init_dentry(void) -{ - struct dentry *d_tracer; - static struct dentry *t_options; - - if (t_options) - return t_options; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return NULL; - - t_options = debugfs_create_dir("options", d_tracer); - if (!t_options) { - pr_warning("Could not create debugfs directory 'options'\n"); - return NULL; - } - - return t_options; -} - -static void -create_trace_option_file(struct trace_option_dentry *topt, - struct tracer_flags *flags, - struct tracer_opt *opt) -{ - struct dentry 
*t_options; - - t_options = trace_options_init_dentry(); - if (!t_options) - return; - - topt->flags = flags; - topt->opt = opt; - - topt->entry = trace_create_file(opt->name, 0644, t_options, topt, - &trace_options_fops); - -} - -static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) -{ - struct trace_option_dentry *topts; - struct tracer_flags *flags; - struct tracer_opt *opts; - int cnt; - - if (!tracer) - return NULL; - - flags = tracer->flags; - - if (!flags || !flags->opts) - return NULL; - - opts = flags->opts; - - for (cnt = 0; opts[cnt].name; cnt++) - ; - - topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); - if (!topts) - return NULL; - - for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(&topts[cnt], flags, - &opts[cnt]); - - return topts; -} - -static void -destroy_trace_option_files(struct trace_option_dentry *topts) -{ - int cnt; - - if (!topts) - return; - - for (cnt = 0; topts[cnt].opt; cnt++) { - if (topts[cnt].entry) - debugfs_remove(topts[cnt].entry); - } - - kfree(topts); -} - -static struct dentry * -create_trace_option_core_file(const char *option, long index) -{ - struct dentry *t_options; - - t_options = trace_options_init_dentry(); - if (!t_options) - return NULL; - - return trace_create_file(option, 0644, t_options, (void *)index, - &trace_options_core_fops); -} - -static __init void create_trace_options_dir(void) -{ - struct dentry *t_options; - int i; - - t_options = trace_options_init_dentry(); - if (!t_options) - return; - - for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(trace_options[i], i); -} - -static __init int tracer_init_debugfs(void) -{ - struct dentry *d_tracer; - int cpu; - - trace_access_lock_init(); - - d_tracer = tracing_init_dentry(); - - trace_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - - trace_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - - trace_create_file("tracing_cpumask", 0644, 
d_tracer, - NULL, &tracing_cpumask_fops); - - trace_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); - -#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); -#endif - - trace_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - - trace_create_file("README", 0444, d_tracer, - NULL, &tracing_readme_fops); - - trace_create_file("trace_pipe", 0444, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - - trace_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - - trace_create_file("buffer_total_size_kb", 0444, d_tracer, - &global_trace, &tracing_total_entries_fops); - - trace_create_file("free_buffer", 0644, d_tracer, - &global_trace, &tracing_free_buffer_fops); - - trace_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - - trace_create_file("saved_cmdlines", 0444, d_tracer, - NULL, &tracing_saved_cmdlines_fops); - - trace_create_file("trace_clock", 0644, d_tracer, NULL, - &trace_clock_fops); - -#ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); -#endif - - create_trace_options_dir(); - - for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(cpu); - - return 0; -} - -static int trace_panic_handler(struct notifier_block *this, - unsigned long event, void *unused) -{ - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - return NOTIFY_OK; -} - -static struct notifier_block trace_panic_notifier = { - .notifier_call = trace_panic_handler, - .next = NULL, - .priority = 150 /* priority: INT_MAX >= x >= 0 */ -}; - -static int trace_die_handler(struct notifier_block 
*self, - unsigned long val, - void *data) -{ - switch (val) { - case DIE_OOPS: - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block trace_die_notifier = { - .notifier_call = trace_die_handler, - .priority = 200 -}; - -/* - * printk is set to max of 1024, we really don't need it that big. - * Nothing should be printing 1000 characters anyway. - */ -#define TRACE_MAX_PRINT 1000 - -/* - * Define here KERN_TRACE so that we have one place to modify - * it if we decide to change what log level the ftrace dump - * should be at. - */ -#define KERN_TRACE KERN_EMERG - -void -trace_printk_seq(struct trace_seq *s) -{ - /* Probably should print a warning here. */ - if (s->len >= 1000) - s->len = 1000; - - /* should be zero ended, but we are paranoid. */ - s->buffer[s->len] = 0; - - printk(KERN_TRACE "%s", s->buffer); - - trace_seq_init(s); -} - -void trace_init_global_iter(struct trace_iterator *iter) -{ - iter->tr = &global_trace; - iter->trace = current_trace; - iter->cpu_file = TRACE_PIPE_ALL_CPU; -} - -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) -{ - static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - unsigned int old_userobj; - static int dump_ran; - unsigned long flags; - int cnt = 0, cpu; - - /* only one dump */ - local_irq_save(flags); - arch_spin_lock(&ftrace_dump_lock); - if (dump_ran) - goto out; - - dump_ran = 1; - - tracing_off(); - - /* Did function tracer already get disabled? 
*/ - if (ftrace_is_dead()) { - printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - printk("# MAY BE MISSING FUNCTION EVENTS\n"); - } - - if (disable_tracing) - ftrace_kill(); - - trace_init_global_iter(&iter); - - for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); - } - - old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; - - /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - - /* Simulate the iterator */ - iter.tr = &global_trace; - iter.trace = current_trace; - - switch (oops_dump_mode) { - case DUMP_ALL: - iter.cpu_file = TRACE_PIPE_ALL_CPU; - break; - case DUMP_ORIG: - iter.cpu_file = raw_smp_processor_id(); - break; - case DUMP_NONE: - goto out_enable; - default: - printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); - iter.cpu_file = TRACE_PIPE_ALL_CPU; - } - - printk(KERN_TRACE "Dumping ftrace buffer:\n"); - - /* - * We need to stop all tracing on all CPUS to read the - * the next buffer. This is a bit expensive, but is - * not done often. We fill all what we can read, - * and then release the locks again. 
- */ - - while (!trace_empty(&iter)) { - - if (!cnt) - printk(KERN_TRACE "---------------------------------\n"); - - cnt++; - - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); - iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; - - if (trace_find_next_entry_inc(&iter) != NULL) { - int ret; - - ret = print_trace_line(&iter); - if (ret != TRACE_TYPE_NO_CONSUME) - trace_consume(&iter); - } - - trace_printk_seq(&iter.seq); - } - - if (!cnt) - printk(KERN_TRACE " (ftrace buffer empty)\n"); - else - printk(KERN_TRACE "---------------------------------\n"); - - out_enable: - /* Re-enable tracing if requested */ - if (!disable_tracing) { - trace_flags |= old_userobj; - - for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); - } - tracing_on(); - } - - out: - arch_spin_unlock(&ftrace_dump_lock); - local_irq_restore(flags); -} - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ - __ftrace_dump(true, oops_dump_mode); -} -EXPORT_SYMBOL_GPL(ftrace_dump); - -__init static int tracer_alloc_buffers(void) -{ - int ring_buf_size; - enum ring_buffer_flags rb_flags; - int i; - int ret = -ENOMEM; - - - if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; - - if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) - goto out_free_buffer_mask; - - /* To save memory, keep the ring buffer size to its minimum */ - if (ring_buffer_expanded) - ring_buf_size = trace_buf_size; - else - ring_buf_size = 1; - - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; - - cpumask_copy(tracing_buffer_mask, cpu_possible_mask); - cpumask_copy(tracing_cpumask, cpu_all_mask); - - /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.buffer) { - printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); - WARN_ON(1); - goto out_free_cpumask; - } - global_trace.entries = ring_buffer_size(global_trace.buffer); - - -#ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, rb_flags); - if (!max_tr.buffer) { - printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); - WARN_ON(1); - ring_buffer_free(global_trace.buffer); - goto out_free_cpumask; - } - max_tr.entries = 1; -#endif - - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_tr_data, i); - } - - trace_init_cmdlines(); - - register_tracer(&nop_trace); - current_trace = &nop_trace; - /* All seems OK, enable tracing */ - tracing_disabled = 0; - - atomic_notifier_chain_register(&panic_notifier_list, - &trace_panic_notifier); - - register_die_notifier(&trace_die_notifier); - - return 0; - -out_free_cpumask: - free_cpumask_var(tracing_cpumask); -out_free_buffer_mask: - free_cpumask_var(tracing_buffer_mask); -out: - return ret; -} - -__init static int clear_boot_tracer(void) -{ - /* - * The default tracer at boot buffer is an init section. - * This function is called in lateinit. If we did not - * find the boot tracer, then clear it out, to prevent - * later registration from accessing the buffer that is - * about to be freed. 
- */ - if (!default_bootup_tracer) - return 0; - - printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", - default_bootup_tracer); - default_bootup_tracer = NULL; - - return 0; -} - -early_initcall(tracer_alloc_buffers); -fs_initcall(tracer_init_debugfs); -late_initcall(clear_boot_tracer); -/* - * unlikely profiler - * - * Copyright (C) 2008 Steven Rostedt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_stat.h" -#include "trace_output.h" - -#ifdef CONFIG_BRANCH_TRACER - -static struct tracer branch_trace; -static int branch_tracing_enabled __read_mostly; -static DEFINE_MUTEX(branch_tracing_mutex); - -static struct trace_array *branch_tracer; - -static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ - struct ftrace_event_call *call = &event_branch; - struct trace_array *tr = branch_tracer; - struct ring_buffer_event *event; - struct trace_branch *entry; - struct ring_buffer *buffer; - unsigned long flags; - int cpu, pc; - const char *p; - - /* - * I would love to save just the ftrace_likely_data pointer, but - * this code can also be used by modules. Ugly things can happen - * if the module is unloaded, and then we go and read the - * pointer. This is slower, but much safer. 
- */ - - if (unlikely(!tr)) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - pc = preempt_count(); - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, - sizeof(*entry), flags, pc); - if (!event) - goto out; - - entry = ring_buffer_event_data(event); - - /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') - p--; - p++; - - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; - entry->correct = val == expect; - - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(flags); -} - -static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ - if (!branch_tracing_enabled) - return; - - probe_likely_condition(f, val, expect); -} - -int enable_branch_tracing(struct trace_array *tr) -{ - mutex_lock(&branch_tracing_mutex); - branch_tracer = tr; - /* - * Must be seen before enabling. 
The reader is a condition - * where we do not need a matching rmb() - */ - smp_wmb(); - branch_tracing_enabled++; - mutex_unlock(&branch_tracing_mutex); - - return 0; -} - -void disable_branch_tracing(void) -{ - mutex_lock(&branch_tracing_mutex); - - if (!branch_tracing_enabled) - goto out_unlock; - - branch_tracing_enabled--; - - out_unlock: - mutex_unlock(&branch_tracing_mutex); -} - -static void start_branch_trace(struct trace_array *tr) -{ - enable_branch_tracing(tr); -} - -static void stop_branch_trace(struct trace_array *tr) -{ - disable_branch_tracing(); -} - -static int branch_trace_init(struct trace_array *tr) -{ - start_branch_trace(tr); - return 0; -} - -static void branch_trace_reset(struct trace_array *tr) -{ - stop_branch_trace(tr); -} - -static enum print_line_t trace_branch_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct trace_branch *field; - - trace_assign_type(field, iter->ent); - - if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", - field->correct ? 
" ok " : " MISS ", - field->func, - field->file, - field->line)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static void branch_print_header(struct seq_file *s) -{ - seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" - " FUNC:FILE:LINE\n"); - seq_puts(s, "# | | | | | " - " |\n"); -} - -static struct trace_event_functions trace_branch_funcs = { - .trace = trace_branch_print, -}; - -static struct trace_event trace_branch_event = { - .type = TRACE_BRANCH, - .funcs = &trace_branch_funcs, -}; - -static struct tracer branch_trace __read_mostly = -{ - .name = "branch", - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif /* CONFIG_FTRACE_SELFTEST */ - .print_header = branch_print_header, -}; - -__init static int init_branch_tracer(void) -{ - int ret; - - ret = register_ftrace_event(&trace_branch_event); - if (!ret) { - printk(KERN_WARNING "Warning: could not register " - "branch events\n"); - return 1; - } - return register_tracer(&branch_trace); -} -device_initcall(init_branch_tracer); - -#else -static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ -} -#endif /* CONFIG_BRANCH_TRACER */ - -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) -{ - /* - * I would love to have a trace point here instead, but the - * trace point code is so inundated with unlikely and likely - * conditions that the recursive nightmare that exists is too - * much to try to get working. At least for now. - */ - trace_likely_condition(f, val, expect); - - /* FIXME: Make this atomic! 
*/ - if (val == expect) - f->correct++; - else - f->incorrect++; -} -EXPORT_SYMBOL(ftrace_likely_update); - -extern unsigned long __start_annotated_branch_profile[]; -extern unsigned long __stop_annotated_branch_profile[]; - -static int annotated_branch_stat_headers(struct seq_file *m) -{ - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; -} - -static inline long get_incorrect_percent(struct ftrace_branch_data *p) -{ - long percent; - - if (p->correct) { - percent = p->incorrect * 100; - percent /= p->correct + p->incorrect; - } else - percent = p->incorrect ? 100 : -1; - - return percent; -} - -static int branch_stat_show(struct seq_file *m, void *v) -{ - struct ftrace_branch_data *p = v; - const char *f; - long percent; - - /* Only print the file, not the path */ - f = p->file + strlen(p->file); - while (f >= p->file && *f != '/') - f--; - f++; - - /* - * The miss is overlayed on correct, and hit on incorrect. 
- */ - percent = get_incorrect_percent(p); - - seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); - if (percent < 0) - seq_printf(m, " X "); - else - seq_printf(m, "%3ld ", percent); - seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); - return 0; -} - -static void *annotated_branch_stat_start(struct tracer_stat *trace) -{ - return __start_annotated_branch_profile; -} - -static void * -annotated_branch_stat_next(void *v, int idx) -{ - struct ftrace_branch_data *p = v; - - ++p; - - if ((void *)p >= (void *)__stop_annotated_branch_profile) - return NULL; - - return p; -} - -static int annotated_branch_stat_cmp(void *p1, void *p2) -{ - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; - - long percent_a, percent_b; - - percent_a = get_incorrect_percent(a); - percent_b = get_incorrect_percent(b); - - if (percent_a < percent_b) - return -1; - if (percent_a > percent_b) - return 1; - - if (a->incorrect < b->incorrect) - return -1; - if (a->incorrect > b->incorrect) - return 1; - - /* - * Since the above shows worse (incorrect) cases - * first, we continue that by showing best (correct) - * cases last. 
- */ - if (a->correct > b->correct) - return -1; - if (a->correct < b->correct) - return 1; - - return 0; -} - -static struct tracer_stat annotated_branch_stats = { - .name = "branch_annotated", - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_cmp = annotated_branch_stat_cmp, - .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show -}; - -__init static int init_annotated_branch_stats(void) -{ - int ret; - - ret = register_stat_tracer(&annotated_branch_stats); - if (!ret) { - printk(KERN_WARNING "Warning: could not register " - "annotated branches stats\n"); - return 1; - } - return 0; -} -fs_initcall(init_annotated_branch_stats); - -#ifdef CONFIG_PROFILE_ALL_BRANCHES - -extern unsigned long __start_branch_profile[]; -extern unsigned long __stop_branch_profile[]; - -static int all_branch_stat_headers(struct seq_file *m) -{ - seq_printf(m, " miss hit %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; -} - -static void *all_branch_stat_start(struct tracer_stat *trace) -{ - return __start_branch_profile; -} - -static void * -all_branch_stat_next(void *v, int idx) -{ - struct ftrace_branch_data *p = v; - - ++p; - - if ((void *)p >= (void *)__stop_branch_profile) - return NULL; - - return p; -} - -static struct tracer_stat all_branch_stats = { - .name = "branch_all", - .stat_start = all_branch_stat_start, - .stat_next = all_branch_stat_next, - .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show -}; - -__init static int all_annotated_branch_stats(void) -{ - int ret; - - ret = register_stat_tracer(&all_branch_stats); - if (!ret) { - printk(KERN_WARNING "Warning: could not register " - "all branches stats\n"); - return 1; - } - return 0; -} -fs_initcall(all_annotated_branch_stats); -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ -/* - * tracing clocks - * - * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar - * - * 
Implements 3 trace clock variants, with differing scalability/precision - * tradeoffs: - * - * - local: CPU-local trace clock - * - medium: scalable global clock with some jitter - * - global: globally monotonic, serialized clock - * - * Tracer plugins will chose a default from these clocks. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -/* - * trace_clock_local(): the simplest and least coherent tracing clock. - * - * Useful for tracing that does not cross to other CPUs nor - * does it go through idle events. - */ -u64 notrace trace_clock_local(void) -{ - u64 clock; - - /* - * sched_clock() is an architecture implemented, fast, scalable, - * lockless clock. It is not guaranteed to be coherent across - * CPUs, nor across CPU idle events. - */ - preempt_disable_notrace(); - clock = sched_clock(); - preempt_enable_notrace(); - - return clock; -} - -/* - * trace_clock(): 'between' trace clock. Not completely serialized, - * but not completely incorrect when crossing CPUs either. - * - * This is based on cpu_clock(), which will allow at most ~1 jiffy of - * jitter between CPUs. So it's a pretty scalable clock, but there - * can be offsets in the trace data. - */ -u64 notrace trace_clock(void) -{ - return local_clock(); -} - - -/* - * trace_clock_global(): special globally coherent trace clock - * - * It has higher overhead than the other trace clocks but is still - * an order of magnitude faster than GTOD derived hardware clocks. - * - * Used by plugins that need globally coherent timestamps. - */ - -/* keep prev_time and lock in the same cacheline. 
*/ -static struct { - u64 prev_time; - arch_spinlock_t lock; -} trace_clock_struct ____cacheline_aligned_in_smp = - { - .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, - }; - -u64 notrace trace_clock_global(void) -{ - unsigned long flags; - int this_cpu; - u64 now; - - local_irq_save(flags); - - this_cpu = raw_smp_processor_id(); - now = cpu_clock(this_cpu); - /* - * If in an NMI context then dont risk lockups and return the - * cpu_clock() time: - */ - if (unlikely(in_nmi())) - goto out; - - arch_spin_lock(&trace_clock_struct.lock); - - /* - * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_time+1, to make sure - * we start ticking with the local clock from now on? - */ - if ((s64)(now - trace_clock_struct.prev_time) < 0) - now = trace_clock_struct.prev_time + 1; - - trace_clock_struct.prev_time = now; - - arch_spin_unlock(&trace_clock_struct.lock); - - out: - local_irq_restore(flags); - - return now; -} - -static atomic64_t trace_counter; - -/* - * trace_clock_counter(): simply an atomic counter. - * Use the trace_counter "counter" for cases where you do not care - * about timings, but are interested in strict ordering. 
- */ -u64 notrace trace_clock_counter(void) -{ - return atomic64_add_return(1, &trace_counter); -} -/* - * trace event based perf event profiling/tracing - * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra - * Copyright (C) 2009-2010 Frederic Weisbecker - */ - -#include -#include -#include "trace.h" - -static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; - -/* - * Force it to be aligned to unsigned long to avoid misaligned accesses - * suprises - */ -typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) - perf_trace_t; - -/* Count the events in use (per event id, not per instance) */ -static int total_ref_count; - -static int perf_trace_event_perm(struct ftrace_event_call *tp_event, - struct perf_event *p_event) -{ - /* No tracing, just counting, so no obvious leak */ - if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) - return 0; - - /* Some events are ok to be traced by non-root users... */ - if (p_event->attach_state == PERF_ATTACH_TASK) { - if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) - return 0; - } - - /* - * ...otherwise raw tracepoint data can be a severe data leak, - * only allow root to have these. 
- */ - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return 0; -} - -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) -{ - struct hlist_head __percpu *list; - int ret; - int cpu; - - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - - p_event->tp_event = tp_event; - if (tp_event->perf_refcount++ > 0) - return 0; - - ret = -ENOMEM; - - list = alloc_percpu(struct hlist_head); - if (!list) - goto fail; - - for_each_possible_cpu(cpu) - INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); - - tp_event->perf_events = list; - - if (!total_ref_count) { - char __percpu *buf; - int i; - - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - buf = (char __percpu *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail; - - perf_trace_buf[i] = buf; - } - } - - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); - if (ret) - goto fail; - - total_ref_count++; - return 0; - -fail: - if (!total_ref_count) { - int i; - - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } - - if (!--tp_event->perf_refcount) { - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - } - - return ret; -} - -int perf_trace_init(struct perf_event *p_event) -{ - struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; - int ret = -EINVAL; - - mutex_lock(&event_mutex); - list_for_each_entry(tp_event, &ftrace_events, list) { - if (tp_event->event.type == event_id && - tp_event->class && tp_event->class->reg && - try_module_get(tp_event->mod)) { - ret = perf_trace_event_init(tp_event, p_event); - if (ret) - module_put(tp_event->mod); - break; - } - } - mutex_unlock(&event_mutex); - - return ret; -} - -int perf_trace_add(struct perf_event *p_event, int flags) -{ - struct ftrace_event_call *tp_event = p_event->tp_event; - struct hlist_head __percpu *pcpu_list; - struct hlist_head *list; - - pcpu_list = 
tp_event->perf_events; - if (WARN_ON_ONCE(!pcpu_list)) - return -EINVAL; - - if (!(flags & PERF_EF_START)) - p_event->hw.state = PERF_HES_STOPPED; - - list = this_cpu_ptr(pcpu_list); - hlist_add_head_rcu(&p_event->hlist_entry, list); - - return 0; -} - -void perf_trace_del(struct perf_event *p_event, int flags) -{ - hlist_del_rcu(&p_event->hlist_entry); -} - -void perf_trace_destroy(struct perf_event *p_event) -{ - struct ftrace_event_call *tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if (--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); -} - -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) -{ - struct trace_entry *entry; - unsigned long flags; - char *raw_data; - int pc; - - BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); - - pc = preempt_count(); - - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) - return NULL; - - raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); - - /* zero the dead bytes from align to not leak stack to user */ - memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); - - entry = (struct trace_entry *)raw_data; - local_save_flags(flags); - tracing_generic_entry_update(entry, flags, pc); - entry->type = type; - - return raw_data; -} -EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); -/* - * event tracer - * - * Copyright (C) 2008 Red Hat Inc, Steven Rostedt - * - * - Added format output of fields of the trace point. - * This was based off of work by Tom Zanussi . 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM "TRACE_SYSTEM" - -DEFINE_MUTEX(event_mutex); - -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - -LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); - -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call) -{ - if (!event_call->class->get_fields) - return &event_call->class->fields; - return event_call->class->get_fields(event_call); -} - -static int __trace_define_field(struct list_head *head, const char *type, - const char *name, int offset, int size, - int is_signed, int filter_type) -{ - struct ftrace_event_field *field; - - field = kzalloc(sizeof(*field), GFP_KERNEL); - if (!field) - goto err; - - field->name = kstrdup(name, GFP_KERNEL); - if (!field->name) - goto err; - - field->type = kstrdup(type, GFP_KERNEL); - if (!field->type) - goto err; - - if (filter_type == FILTER_OTHER) - field->filter_type = filter_assign_type(type); - else - field->filter_type = filter_type; - - field->offset = offset; - field->size = size; - field->is_signed = is_signed; - - list_add(&field->link, head); - - return 0; - -err: - if (field) - kfree(field->name); - kfree(field); - - return -ENOMEM; -} - -int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed, - int filter_type) -{ - struct list_head *head; - - if (WARN_ON(!call->class)) - return 0; - - head = trace_get_fields(call); - return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type); -} -EXPORT_SYMBOL_GPL(trace_define_field); - -#define __common_field(type, item) \ - ret = __trace_define_field(&ftrace_common_fields, #type, \ - "common_" #item, \ - offsetof(typeof(ent), item), \ - sizeof(ent.item), \ - 
is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; - -static int trace_define_common_fields(void) -{ - int ret; - struct trace_entry ent; - - __common_field(unsigned short, type); - __common_field(unsigned char, flags); - __common_field(unsigned char, preempt_count); - __common_field(int, pid); - __common_field(int, padding); - - return ret; -} - -void trace_destroy_fields(struct ftrace_event_call *call) -{ - struct ftrace_event_field *field, *next; - struct list_head *head; - - head = trace_get_fields(call); - list_for_each_entry_safe(field, next, head, link) { - list_del(&field->link); - kfree(field->type); - kfree(field->name); - kfree(field); - } -} - -int trace_event_raw_init(struct ftrace_event_call *call) -{ - int id; - - id = register_ftrace_event(&call->event); - if (!id) - return -ENODEV; - - return 0; -} -EXPORT_SYMBOL_GPL(trace_event_raw_init); - -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) -{ - switch (type) { - case TRACE_REG_REGISTER: - return tracepoint_probe_register(call->name, - call->class->probe, - call); - case TRACE_REG_UNREGISTER: - tracepoint_probe_unregister(call->name, - call->class->probe, - call); - return 0; - -#ifdef CONFIG_PERF_EVENTS - case TRACE_REG_PERF_REGISTER: - return tracepoint_probe_register(call->name, - call->class->perf_probe, - call); - case TRACE_REG_PERF_UNREGISTER: - tracepoint_probe_unregister(call->name, - call->class->perf_probe, - call); - return 0; -#endif - } - return 0; -} -EXPORT_SYMBOL_GPL(ftrace_event_reg); - -void trace_event_enable_cmd_record(bool enable) -{ - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) - continue; - - if (enable) { - tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; - } else { - tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; - } - } - mutex_unlock(&event_mutex); -} - 
-static int ftrace_event_enable_disable(struct ftrace_event_call *call, - int enable) -{ - int ret = 0; - - switch (enable) { - case 0: - if (call->flags & TRACE_EVENT_FL_ENABLED) { - call->flags &= ~TRACE_EVENT_FL_ENABLED; - if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { - tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; - } - call->class->reg(call, TRACE_REG_UNREGISTER); - } - break; - case 1: - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { - if (trace_flags & TRACE_ITER_RECORD_CMD) { - tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; - } - ret = call->class->reg(call, TRACE_REG_REGISTER); - if (ret) { - tracing_stop_cmdline_record(); - pr_info("event trace: Could not enable event " - "%s\n", call->name); - break; - } - call->flags |= TRACE_EVENT_FL_ENABLED; - } - break; - } - - return ret; -} - -static void ftrace_clear_events(void) -{ - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - ftrace_event_enable_disable(call, 0); - } - mutex_unlock(&event_mutex); -} - -static void __put_system(struct event_subsystem *system) -{ - struct event_filter *filter = system->filter; - - WARN_ON_ONCE(system->ref_count == 0); - if (--system->ref_count) - return; - - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); -} - -static void __get_system(struct event_subsystem *system) -{ - WARN_ON_ONCE(system->ref_count == 0); - system->ref_count++; -} - -static void put_system(struct event_subsystem *system) -{ - mutex_lock(&event_mutex); - __put_system(system); - mutex_unlock(&event_mutex); -} - -/* - * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
- */ -static int __ftrace_set_clr_event(const char *match, const char *sub, - const char *event, int set) -{ - struct ftrace_event_call *call; - int ret = -EINVAL; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->name || !call->class || !call->class->reg) - continue; - - if (match && - strcmp(match, call->name) != 0 && - strcmp(match, call->class->system) != 0) - continue; - - if (sub && strcmp(sub, call->class->system) != 0) - continue; - - if (event && strcmp(event, call->name) != 0) - continue; - - ftrace_event_enable_disable(call, set); - - ret = 0; - } - mutex_unlock(&event_mutex); - - return ret; -} - -static int ftrace_set_clr_event(char *buf, int set) -{ - char *event = NULL, *sub = NULL, *match; - - /* - * The buf format can be : - * *: means any event by that name. - * : is the same. - * - * :* means all events in that subsystem - * : means the same. - * - * (no ':') means all events in a subsystem with - * the name or any event that matches - */ - - match = strsep(&buf, ":"); - if (buf) { - sub = match; - event = buf; - match = NULL; - - if (!strlen(sub) || strcmp(sub, "*") == 0) - sub = NULL; - if (!strlen(event) || strcmp(event, "*") == 0) - event = NULL; - } - - return __ftrace_set_clr_event(match, sub, event, set); -} - -/** - * trace_set_clr_event - enable or disable an event - * @system: system name to match (NULL for any system) - * @event: event name to match (NULL for all events, within system) - * @set: 1 to enable, 0 to disable - * - * This is a way for other parts of the kernel to enable or disable - * event recording. - * - * Returns 0 on success, -EINVAL if the parameters do not match any - * registered events. 
- */ -int trace_set_clr_event(const char *system, const char *event, int set) -{ - return __ftrace_set_clr_event(NULL, system, event, set); -} -EXPORT_SYMBOL_GPL(trace_set_clr_event); - -/* 128 should be much more than enough */ -#define EVENT_BUF_SIZE 127 - -static ssize_t -ftrace_event_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_parser parser; - ssize_t read, ret; - - if (!cnt) - return 0; - - ret = tracing_update_buffers(); - if (ret < 0) - return ret; - - if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) - return -ENOMEM; - - read = trace_get_user(&parser, ubuf, cnt, ppos); - - if (read >= 0 && trace_parser_loaded((&parser))) { - int set = 1; - - if (*parser.buffer == '!') - set = 0; - - parser.buffer[parser.idx] = 0; - - ret = ftrace_set_clr_event(parser.buffer + !set, set); - if (ret) - goto out_put; - } - - ret = read; - - out_put: - trace_parser_put(&parser); - - return ret; -} - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_event_call *call = v; - - (*pos)++; - - list_for_each_entry_continue(call, &ftrace_events, list) { - /* - * The ftrace subsystem is for showing formats only. - * They can not be enabled or disabled via the event files. 
- */ - if (call->class && call->class->reg) - return call; - } - - return NULL; -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - struct ftrace_event_call *call; - loff_t l; - - mutex_lock(&event_mutex); - - call = list_entry(&ftrace_events, struct ftrace_event_call, list); - for (l = 0; l <= *pos; ) { - call = t_next(m, call, &l); - if (!call) - break; - } - return call; -} - -static void * -s_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_event_call *call = v; - - (*pos)++; - - list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->flags & TRACE_EVENT_FL_ENABLED) - return call; - } - - return NULL; -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - struct ftrace_event_call *call; - loff_t l; - - mutex_lock(&event_mutex); - - call = list_entry(&ftrace_events, struct ftrace_event_call, list); - for (l = 0; l <= *pos; ) { - call = s_next(m, call, &l); - if (!call) - break; - } - return call; -} - -static int t_show(struct seq_file *m, void *v) -{ - struct ftrace_event_call *call = v; - - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", call->name); - - return 0; -} - -static void t_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&event_mutex); -} - -static int -ftrace_event_seq_open(struct inode *inode, struct file *file) -{ - const struct seq_operations *seq_ops; - - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_clear_events(); - - seq_ops = inode->i_private; - return seq_open(file, seq_ops); -} - -static ssize_t -event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char *buf; - - if (call->flags & TRACE_EVENT_FL_ENABLED) - buf = "1\n"; - else - buf = "0\n"; - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); -} - -static ssize_t -event_enable_write(struct file *filp, const char __user *ubuf, size_t 
cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - ret = tracing_update_buffers(); - if (ret < 0) - return ret; - - switch (val) { - case 0: - case 1: - mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(call, val); - mutex_unlock(&event_mutex); - break; - - default: - return -EINVAL; - } - - *ppos += cnt; - - return ret ? ret : cnt; -} - -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct event_subsystem *system = filp->private_data; - struct ftrace_event_call *call; - char buf[2]; - int set = 0; - int ret; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || !call->class->reg) - continue; - - if (system && strcmp(call->class->system, system->name) != 0) - continue; - - /* - * We need to find out if all the events are set - * or if all events or cleared, or if we have - * a mixture. - */ - set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); - - /* - * If we have a mixture, no need to look further. - */ - if (set == 3) - break; - } - mutex_unlock(&event_mutex); - - buf[0] = set_to_char[set]; - buf[1] = '\n'; - - ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); - - return ret; -} - -static ssize_t -system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct event_subsystem *system = filp->private_data; - const char *name = NULL; - unsigned long val; - ssize_t ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - ret = tracing_update_buffers(); - if (ret < 0) - return ret; - - if (val != 0 && val != 1) - return -EINVAL; - - /* - * Opening of "enable" adds a ref count to system, - * so the name is safe to use. 
- */ - if (system) - name = system->name; - - ret = __ftrace_set_clr_event(NULL, name, NULL, val); - if (ret) - goto out; - - ret = cnt; - -out: - *ppos += cnt; - - return ret; -} - -enum { - FORMAT_HEADER = 1, - FORMAT_FIELD_SEPERATOR = 2, - FORMAT_PRINTFMT = 3, -}; - -static void *f_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_event_call *call = m->private; - struct ftrace_event_field *field; - struct list_head *common_head = &ftrace_common_fields; - struct list_head *head = trace_get_fields(call); - - (*pos)++; - - switch ((unsigned long)v) { - case FORMAT_HEADER: - if (unlikely(list_empty(common_head))) - return NULL; - - field = list_entry(common_head->prev, - struct ftrace_event_field, link); - return field; - - case FORMAT_FIELD_SEPERATOR: - if (unlikely(list_empty(head))) - return NULL; - - field = list_entry(head->prev, struct ftrace_event_field, link); - return field; - - case FORMAT_PRINTFMT: - /* all done */ - return NULL; - } - - field = v; - if (field->link.prev == common_head) - return (void *)FORMAT_FIELD_SEPERATOR; - else if (field->link.prev == head) - return (void *)FORMAT_PRINTFMT; - - field = list_entry(field->link.prev, struct ftrace_event_field, link); - - return field; -} - -static void *f_start(struct seq_file *m, loff_t *pos) -{ - loff_t l = 0; - void *p; - - /* Start by showing the header */ - if (!*pos) - return (void *)FORMAT_HEADER; - - p = (void *)FORMAT_HEADER; - do { - p = f_next(m, p, &l); - } while (p && l < *pos); - - return p; -} - -static int f_show(struct seq_file *m, void *v) -{ - struct ftrace_event_call *call = m->private; - struct ftrace_event_field *field; - const char *array_descriptor; - - switch ((unsigned long)v) { - case FORMAT_HEADER: - seq_printf(m, "name: %s\n", call->name); - seq_printf(m, "ID: %d\n", call->event.type); - seq_printf(m, "format:\n"); - return 0; - - case FORMAT_FIELD_SEPERATOR: - seq_putc(m, '\n'); - return 0; - - case FORMAT_PRINTFMT: - seq_printf(m, "\nprint fmt: %s\n", - 
call->print_fmt); - return 0; - } - - field = v; - - /* - * Smartly shows the array type(except dynamic array). - * Normal: - * field:TYPE VAR - * If TYPE := TYPE[LEN], it is shown: - * field:TYPE VAR[LEN] - */ - array_descriptor = strchr(field->type, '['); - - if (!strncmp(field->type, "__data_loc", 10)) - array_descriptor = NULL; - - if (!array_descriptor) - seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", - field->type, field->name, field->offset, - field->size, !!field->is_signed); - else - seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", - (int)(array_descriptor - field->type), - field->type, field->name, - array_descriptor, field->offset, - field->size, !!field->is_signed); - - return 0; -} - -static void f_stop(struct seq_file *m, void *p) -{ -} - -static const struct seq_operations trace_format_seq_ops = { - .start = f_start, - .next = f_next, - .stop = f_stop, - .show = f_show, -}; - -static int trace_format_open(struct inode *inode, struct file *file) -{ - struct ftrace_event_call *call = inode->i_private; - struct seq_file *m; - int ret; - - ret = seq_open(file, &trace_format_seq_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = call; - - return 0; -} - -static ssize_t -event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - int r; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->event.type); - - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - kfree(s); - return r; -} - -static ssize_t -event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - int r; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - 
trace_seq_init(s); - - print_event_filter(call, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); - - kfree(s); - - return r; -} - -static ssize_t -event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char *buf; - int err; - - if (cnt >= PAGE_SIZE) - return -EINVAL; - - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; - - err = apply_event_filter(call, buf); - free_page((unsigned long) buf); - if (err < 0) - return err; - - *ppos += cnt; - - return cnt; -} - -static LIST_HEAD(event_subsystems); - -static int subsystem_open(struct inode *inode, struct file *filp) -{ - struct event_subsystem *system = NULL; - int ret; - - if (!inode->i_private) - goto skip_search; - - /* Make sure the system still exists */ - mutex_lock(&event_mutex); - list_for_each_entry(system, &event_subsystems, list) { - if (system == inode->i_private) { - /* Don't open systems with no events */ - if (!system->nr_events) { - system = NULL; - break; - } - __get_system(system); - break; - } - } - mutex_unlock(&event_mutex); - - if (system != inode->i_private) - return -ENODEV; - - skip_search: - ret = tracing_open_generic(inode, filp); - if (ret < 0 && system) - put_system(system); - - return ret; -} - -static int subsystem_release(struct inode *inode, struct file *file) -{ - struct event_subsystem *system = inode->i_private; - - if (system) - put_system(system); - - return 0; -} - -static ssize_t -subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct event_subsystem *system = filp->private_data; - struct trace_seq *s; - int r; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - - print_subsystem_event_filter(system, s); - r = 
simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); - - kfree(s); - - return r; -} - -static ssize_t -subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct event_subsystem *system = filp->private_data; - char *buf; - int err; - - if (cnt >= PAGE_SIZE) - return -EINVAL; - - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; - - err = apply_subsystem_event_filter(system, buf); - free_page((unsigned long) buf); - if (err < 0) - return err; - - *ppos += cnt; - - return cnt; -} - -static ssize_t -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) -{ - int (*func)(struct trace_seq *s) = filp->private_data; - struct trace_seq *s; - int r; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - - func(s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); - - kfree(s); - - return r; -} - -static const struct seq_operations show_event_seq_ops = { - .start = t_start, - .next = t_next, - .show = t_show, - .stop = t_stop, -}; - -static const struct seq_operations show_set_event_seq_ops = { - .start = s_start, - .next = s_next, - .show = t_show, - .stop = t_stop, -}; - -static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations ftrace_set_event_fops = { - .open = ftrace_event_seq_open, - .read = seq_read, - .write = ftrace_event_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, - .read = event_enable_read, - .write = event_enable_write, - .llseek = default_llseek, -}; - -static const struct file_operations ftrace_event_format_fops 
= { - .open = trace_format_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations ftrace_event_id_fops = { - .open = tracing_open_generic, - .read = event_id_read, - .llseek = default_llseek, -}; - -static const struct file_operations ftrace_event_filter_fops = { - .open = tracing_open_generic, - .read = event_filter_read, - .write = event_filter_write, - .llseek = default_llseek, -}; - -static const struct file_operations ftrace_subsystem_filter_fops = { - .open = subsystem_open, - .read = subsystem_filter_read, - .write = subsystem_filter_write, - .llseek = default_llseek, - .release = subsystem_release, -}; - -static const struct file_operations ftrace_system_enable_fops = { - .open = subsystem_open, - .read = system_enable_read, - .write = system_enable_write, - .llseek = default_llseek, - .release = subsystem_release, -}; - -static const struct file_operations ftrace_show_header_fops = { - .open = tracing_open_generic, - .read = show_header, - .llseek = default_llseek, -}; - -static struct dentry *event_trace_events_dir(void) -{ - static struct dentry *d_tracer; - static struct dentry *d_events; - - if (d_events) - return d_events; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return NULL; - - d_events = debugfs_create_dir("events", d_tracer); - if (!d_events) - pr_warning("Could not create debugfs " - "'events' directory\n"); - - return d_events; -} - -static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) -{ - struct event_subsystem *system; - struct dentry *entry; - - /* First see if we did not already create this dir */ - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - system->nr_events++; - return system->entry; - } - } - - /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); - if (!system) { - pr_warning("No memory to create event subsystem %s\n", - name); - return d_events; 
- } - - system->entry = debugfs_create_dir(name, d_events); - if (!system->entry) { - pr_warning("Could not create event subsystem %s\n", - name); - kfree(system); - return d_events; - } - - system->nr_events = 1; - system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) { - debugfs_remove(system->entry); - kfree(system); - return d_events; - } - - list_add(&system->list, &event_subsystems); - - system->filter = NULL; - - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); - if (!system->filter) { - pr_warning("Could not allocate filter for subsystem " - "'%s'\n", name); - return system->entry; - } - - entry = debugfs_create_file("filter", 0644, system->entry, system, - &ftrace_subsystem_filter_fops); - if (!entry) { - kfree(system->filter); - system->filter = NULL; - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); - } - - trace_create_file("enable", 0644, system->entry, system, - &ftrace_system_enable_fops); - - return system->entry; -} - -static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) -{ - struct list_head *head; - int ret; - - /* - * If the trace point header did not define TRACE_SYSTEM - * then the system would be called "TRACE_SYSTEM". - */ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->class->system, d_events); - - call->dir = debugfs_create_dir(call->name, d_events); - if (!call->dir) { - pr_warning("Could not create debugfs " - "'%s' directory\n", call->name); - return -1; - } - - if (call->class->reg) - trace_create_file("enable", 0644, call->dir, call, - enable); - -#ifdef CONFIG_PERF_EVENTS - if (call->event.type && call->class->reg) - trace_create_file("id", 0444, call->dir, call, - id); -#endif - - /* - * Other events may have the same class. 
Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - ret = call->class->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } - } - trace_create_file("filter", 0644, call->dir, call, - filter); - - trace_create_file("format", 0444, call->dir, call, - format); - - return 0; -} - -static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) -{ - struct dentry *d_events; - int ret; - - /* The linker may leave blanks */ - if (!call->name) - return -EINVAL; - - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace events/%s\n", - call->name); - return ret; - } - } - - d_events = event_trace_events_dir(); - if (!d_events) - return -ENOENT; - - ret = event_create_dir(call, d_events, id, enable, filter, format); - if (!ret) - list_add(&call->list, &ftrace_events); - call->mod = mod; - - return ret; -} - -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct ftrace_event_call *call) -{ - int ret; - mutex_lock(&event_mutex); - ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - mutex_unlock(&event_mutex); - return ret; -} - -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; - - if (strcmp(name, TRACE_SYSTEM) == 0) - return; - - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - debugfs_remove_recursive(system->entry); - list_del(&system->list); - __put_system(system); - } - break; - } - } -} - -/* - * Must be called under 
locking both of event_mutex and trace_event_mutex. - */ -static void __trace_remove_event_call(struct ftrace_event_call *call) -{ - ftrace_event_enable_disable(call, 0); - if (call->event.funcs) - __unregister_ftrace_event(&call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->class->system); -} - -/* Remove an event_call */ -void trace_remove_event_call(struct ftrace_event_call *call) -{ - mutex_lock(&event_mutex); - down_write(&trace_event_mutex); - __trace_remove_event_call(call); - up_write(&trace_event_mutex); - mutex_unlock(&event_mutex); -} - -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) - -#ifdef CONFIG_MODULES - -static LIST_HEAD(ftrace_module_file_list); - -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; - -static struct ftrace_module_file_ops * -trace_create_file_ops(struct module *mod) -{ - struct ftrace_module_file_ops *file_ops; - - /* - * This is a bit of a PITA. To allow for correct reference - * counting, modules must "own" their file_operations. - * To do this, we allocate the file operations that will be - * used in the event directory. 
- */ - - file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); - if (!file_ops) - return NULL; - - file_ops->mod = mod; - - file_ops->id = ftrace_event_id_fops; - file_ops->id.owner = mod; - - file_ops->enable = ftrace_enable_fops; - file_ops->enable.owner = mod; - - file_ops->filter = ftrace_event_filter_fops; - file_ops->filter.owner = mod; - - file_ops->format = ftrace_event_format_fops; - file_ops->format.owner = mod; - - list_add(&file_ops->list, &ftrace_module_file_list); - - return file_ops; -} - -static void trace_module_add_events(struct module *mod) -{ - struct ftrace_module_file_ops *file_ops = NULL; - struct ftrace_event_call **call, **start, **end; - - start = mod->trace_events; - end = mod->trace_events + mod->num_trace_events; - - if (start == end) - return; - - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - - for_each_event(call, start, end) { - __trace_add_event_call(*call, mod, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); - } -} - -static void trace_module_remove_events(struct module *mod) -{ - struct ftrace_module_file_ops *file_ops; - struct ftrace_event_call *call, *p; - bool found = false; - - down_write(&trace_event_mutex); - list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) { - found = true; - __trace_remove_event_call(call); - } - } - - /* Now free the file_operations */ - list_for_each_entry(file_ops, &ftrace_module_file_list, list) { - if (file_ops->mod == mod) - break; - } - if (&file_ops->list != &ftrace_module_file_list) { - list_del(&file_ops->list); - kfree(file_ops); - } - - /* - * It is safest to reset the ring buffer if the module being unloaded - * registered any events. 
- */ - if (found) - tracing_reset_current_online_cpus(); - up_write(&trace_event_mutex); -} - -static int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - mutex_lock(&event_mutex); - switch (val) { - case MODULE_STATE_COMING: - trace_module_add_events(mod); - break; - case MODULE_STATE_GOING: - trace_module_remove_events(mod); - break; - } - mutex_unlock(&event_mutex); - - return 0; -} -#else -static int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - -static struct notifier_block trace_module_nb = { - .notifier_call = trace_module_notify, - .priority = 0, -}; - -extern struct ftrace_event_call *__start_ftrace_events[]; -extern struct ftrace_event_call *__stop_ftrace_events[]; - -static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; - -static __init int setup_trace_event(char *str) -{ - strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = 1; - tracing_selftest_disabled = 1; - - return 1; -} -__setup("trace_event=", setup_trace_event); - -static __init int event_trace_init(void) -{ - struct ftrace_event_call **call; - struct dentry *d_tracer; - struct dentry *entry; - struct dentry *d_events; - int ret; - char *buf = bootup_event_buf; - char *token; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return 0; - - entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); - - entry = debugfs_create_file("set_event", 0644, d_tracer, - (void *)&show_set_event_seq_ops, - &ftrace_set_event_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_event' entry\n"); - - d_events = event_trace_events_dir(); - if (!d_events) - return 0; - - /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - 
ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - trace_create_file("enable", 0644, d_events, - NULL, &ftrace_system_enable_fops); - - if (trace_define_common_fields()) - pr_warning("tracing: Failed to allocate common fields"); - - for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - } - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(token, 1); - if (ret) - pr_warning("Failed to enable trace event: %s\n", token); - } - - ret = register_module_notifier(&trace_module_nb); - if (ret) - pr_warning("Failed to register trace events module notifier\n"); - - return 0; -} -fs_initcall(event_trace_init); - -#ifdef CONFIG_FTRACE_STARTUP_TEST - -static DEFINE_SPINLOCK(test_spinlock); -static DEFINE_SPINLOCK(test_spinlock_irq); -static DEFINE_MUTEX(test_mutex); - -static __init void test_work(struct work_struct *dummy) -{ - spin_lock(&test_spinlock); - spin_lock_irq(&test_spinlock_irq); - udelay(1); - spin_unlock_irq(&test_spinlock_irq); - spin_unlock(&test_spinlock); - - mutex_lock(&test_mutex); - msleep(1); - mutex_unlock(&test_mutex); -} - -static __init int event_test_thread(void *unused) -{ - void *test_malloc; - - test_malloc = kmalloc(1234, GFP_KERNEL); - if (!test_malloc) - pr_info("failed to kmalloc\n"); - - schedule_on_each_cpu(test_work); - - kfree(test_malloc); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) - schedule(); - - return 0; -} - -/* - * Do various things that may trigger events. 
- */ -static __init void event_test_stuff(void) -{ - struct task_struct *test_thread; - - test_thread = kthread_run(event_test_thread, NULL, "test-events"); - msleep(1); - kthread_stop(test_thread); -} - -/* - * For every trace event defined, we will test each trace point separately, - * and then by groups, and finally all trace points. - */ -static __init void event_trace_self_tests(void) -{ - struct ftrace_event_call *call; - struct event_subsystem *system; - int ret; - - pr_info("Running tests on trace events:\n"); - - list_for_each_entry(call, &ftrace_events, list) { - - /* Only test those that have a probe */ - if (!call->class || !call->class->probe) - continue; - -/* - * Testing syscall events here is pretty useless, but - * we still do it if configured. But this is time consuming. - * What we really need is a user thread to perform the - * syscalls as we test. - */ -#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS - if (call->class->system && - strcmp(call->class->system, "syscalls") == 0) - continue; -#endif - - pr_info("Testing event %s: ", call->name); - - /* - * If an event is already enabled, someone is using - * it and the self test should not be on. 
- */ - if (call->flags & TRACE_EVENT_FL_ENABLED) { - pr_warning("Enabled event during self test!\n"); - WARN_ON_ONCE(1); - continue; - } - - ftrace_event_enable_disable(call, 1); - event_test_stuff(); - ftrace_event_enable_disable(call, 0); - - pr_cont("OK\n"); - } - - /* Now test at the sub system level */ - - pr_info("Running tests on trace event systems:\n"); - - list_for_each_entry(system, &event_subsystems, list) { - - /* the ftrace system is special, skip it */ - if (strcmp(system->name, "ftrace") == 0) - continue; - - pr_info("Testing event system %s: ", system->name); - - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); - if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling system %s\n", - system->name); - continue; - } - - event_test_stuff(); - - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); - if (WARN_ON_ONCE(ret)) - pr_warning("error disabling system %s\n", - system->name); - - pr_cont("OK\n"); - } - - /* Test with all events enabled */ - - pr_info("Running tests on all trace events:\n"); - pr_info("Testing all events: "); - - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); - if (WARN_ON_ONCE(ret)) { - pr_warning("error enabling all events\n"); - return; - } - - event_test_stuff(); - - /* reset sysname */ - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); - if (WARN_ON_ONCE(ret)) { - pr_warning("error disabling all events\n"); - return; - } - - pr_cont("OK\n"); -} - -#ifdef CONFIG_FUNCTION_TRACER - -static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); - -static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) -{ - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct ftrace_entry *entry; - unsigned long flags; - long disabled; - int cpu; - int pc; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); - - if (disabled != 1) - goto out; - - local_save_flags(flags); - - 
event = trace_current_buffer_lock_reserve(&buffer, - TRACE_FN, sizeof(*entry), - flags, pc); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->parent_ip = parent_ip; - - trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); - - out: - atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); - preempt_enable_notrace(); -} - -static struct ftrace_ops trace_ops __initdata = -{ - .func = function_test_events_call, -}; - -static __init void event_trace_self_test_with_function(void) -{ - int ret; - ret = register_ftrace_function(&trace_ops); - if (WARN_ON(ret < 0)) { - pr_info("Failed to enable function tracer for event tests\n"); - return; - } - pr_info("Running tests again, along with the function tracer\n"); - event_trace_self_tests(); - unregister_ftrace_function(&trace_ops); -} -#else -static __init void event_trace_self_test_with_function(void) -{ -} -#endif - -static __init int event_trace_self_tests_init(void) -{ - if (!tracing_selftest_disabled) { - event_trace_self_tests(); - event_trace_self_test_with_function(); - } - - return 0; -} - -late_initcall(event_trace_self_tests_init); - -#endif -/* - * trace_events_filter - generic event filtering - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright (C) 2009 Tom Zanussi - */ - -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -#define DEFAULT_SYS_FILTER_MESSAGE \ - "### global filter ###\n" \ - "# Use this to set filters for multiple events.\n" \ - "# Only events with the given fields will be affected.\n" \ - "# If no events are modified, an error message will be displayed here" - -enum filter_op_ids -{ - OP_OR, - OP_AND, - OP_GLOB, - OP_NE, - OP_EQ, - OP_LT, - OP_LE, - OP_GT, - OP_GE, - OP_NONE, - OP_OPEN_PAREN, -}; - -struct filter_op { - int id; - char *string; - int precedence; -}; - -static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_GLOB, "~", 4 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, -}; - -enum { - FILT_ERR_NONE, - FILT_ERR_INVALID_OP, - FILT_ERR_UNBALANCED_PAREN, - FILT_ERR_TOO_MANY_OPERANDS, - FILT_ERR_OPERAND_TOO_LONG, - FILT_ERR_FIELD_NOT_FOUND, - FILT_ERR_ILLEGAL_FIELD_OP, - FILT_ERR_ILLEGAL_INTVAL, - FILT_ERR_BAD_SUBSYS_FILTER, - FILT_ERR_TOO_MANY_PREDS, - FILT_ERR_MISSING_FIELD, - FILT_ERR_INVALID_FILTER, -}; - -static char *err_text[] = { - "No error", - "Invalid operator", - "Unbalanced parens", - "Too many operands", - "Operand too long", - "Field not found", - "Illegal operation for field type", - "Illegal integer value", - "Couldn't find or set field in one of a subsystem's events", - "Too many terms in predicate expression", - "Missing field name and/or value", - "Meaningless filter expression", -}; - -struct opstack_op { - int op; - struct list_head list; -}; - -struct postfix_elt { - int op; - char *operand; - struct list_head list; -}; - -struct filter_parse_state { - struct filter_op *ops; - struct list_head opstack; - struct list_head postfix; - int lasterr; - int lasterr_pos; - - struct { - char *string; - unsigned int cnt; - 
unsigned int tail; - } infix; - - struct { - char string[MAX_FILTER_STR_VAL]; - int pos; - unsigned int tail; - } operand; -}; - -struct pred_stack { - struct filter_pred **preds; - int index; -}; - -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} - -DEFINE_COMPARISON_PRED(s64); -DEFINE_COMPARISON_PRED(u64); -DEFINE_COMPARISON_PRED(s32); -DEFINE_COMPARISON_PRED(u32); -DEFINE_COMPARISON_PRED(s16); -DEFINE_COMPARISON_PRED(u16); -DEFINE_COMPARISON_PRED(s8); -DEFINE_COMPARISON_PRED(u8); - -DEFINE_EQUALITY_PRED(64); -DEFINE_EQUALITY_PRED(32); -DEFINE_EQUALITY_PRED(16); -DEFINE_EQUALITY_PRED(8); - -/* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event) -{ - char *addr = (char *)(event + pred->offset); - int cmp, match; - - cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - - match = cmp ^ pred->not; - - return match; -} - -/* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event) -{ - char **addr = (char **)(event + pred->offset); - int cmp, match; - int len = strlen(*addr) + 1; /* including tailing '\0' */ - - cmp = pred->regex.match(*addr, &pred->regex, len); - - match = cmp 
^ pred->not; - - return match; -} - -/* - * Filter predicate for dynamic sized arrays of characters. - * These are implemented through a list of strings at the end - * of the entry. - * Also each of these strings have a field in the entry which - * contains its offset from the beginning of the entry. - * We have then first to get this field, dereference it - * and add it to the address of the entry, and at last we have - * the address of the string. - */ -static int filter_pred_strloc(struct filter_pred *pred, void *event) -{ - u32 str_item = *(u32 *)(event + pred->offset); - int str_loc = str_item & 0xffff; - int str_len = str_item >> 16; - char *addr = (char *)(event + str_loc); - int cmp, match; - - cmp = pred->regex.match(addr, &pred->regex, str_len); - - match = cmp ^ pred->not; - - return match; -} - -static int filter_pred_none(struct filter_pred *pred, void *event) -{ - return 0; -} - -/* - * regex_match_foo - Basic regex callbacks - * - * @str: the string to be searched - * @r: the regex structure containing the pattern string - * @len: the length of the string to be searched (including '\0') - * - * Note: - * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING - */ - -static int regex_match_full(char *str, struct regex *r, int len) -{ - if (strncmp(str, r->pattern, len) == 0) - return 1; - return 0; -} - -static int regex_match_front(char *str, struct regex *r, int len) -{ - if (strncmp(str, r->pattern, r->len) == 0) - return 1; - return 0; -} - -static int regex_match_middle(char *str, struct regex *r, int len) -{ - if (strnstr(str, r->pattern, len)) - return 1; - return 0; -} - -static int regex_match_end(char *str, struct regex *r, int len) -{ - int strlen = len - 1; - - if (strlen >= r->len && - memcmp(str + strlen - r->len, r->pattern, r->len) == 0) - return 1; - return 0; -} - -/** - * filter_parse_regex - parse a basic regex - * @buff: the raw regex - * @len: length of the regex - * @search: will point to the 
beginning of the string to compare - * @not: tell whether the match will have to be inverted - * - * This passes in a buffer containing a regex and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - -static void filter_build_regex(struct filter_pred *pred) -{ - struct regex *r = &pred->regex; - char *search; - enum regex_type type = MATCH_FULL; - int not = 0; - - if (pred->op == OP_GLOB) { - type = filter_parse_regex(r->pattern, r->len, &search, ¬); - r->len = strlen(search); - memmove(r->pattern, search, r->len+1); - } - - switch (type) { - case MATCH_FULL: - r->match = regex_match_full; - break; - case MATCH_FRONT_ONLY: - r->match = regex_match_front; - break; - case MATCH_MIDDLE_ONLY: - r->match = regex_match_middle; - break; - case MATCH_END_ONLY: - r->match = regex_match_end; - break; - } - - pred->not ^= not; -} - -enum move_type { - MOVE_DOWN, - MOVE_UP_FROM_LEFT, - MOVE_UP_FROM_RIGHT -}; - -static struct filter_pred * -get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, - int index, enum move_type *move) -{ - if (pred->parent & FILTER_PRED_IS_RIGHT) - *move = MOVE_UP_FROM_RIGHT; - else - *move = MOVE_UP_FROM_LEFT; - pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; - - return pred; -} - -enum 
walk_return { - WALK_PRED_ABORT, - WALK_PRED_PARENT, - WALK_PRED_DEFAULT, -}; - -typedef int (*filter_pred_walkcb_t) (enum move_type move, - struct filter_pred *pred, - int *err, void *data); - -static int walk_pred_tree(struct filter_pred *preds, - struct filter_pred *root, - filter_pred_walkcb_t cb, void *data) -{ - struct filter_pred *pred = root; - enum move_type move = MOVE_DOWN; - int done = 0; - - if (!preds) - return -EINVAL; - - do { - int err = 0, ret; - - ret = cb(move, pred, &err, data); - if (ret == WALK_PRED_ABORT) - return err; - if (ret == WALK_PRED_PARENT) - goto get_parent; - - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - goto get_parent; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - get_parent: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, - &move); - continue; - } - done = 1; - } while (!done); - - /* We are fine. */ - return 0; -} - -/* - * A series of AND or ORs where found together. Instead of - * climbing up and down the tree branches, an array of the - * ops were made in order of checks. We can just move across - * the array and short circuit if needed. - */ -static int process_ops(struct filter_pred *preds, - struct filter_pred *op, void *rec) -{ - struct filter_pred *pred; - int match = 0; - int type; - int i; - - /* - * Micro-optimization: We set type to true if op - * is an OR and false otherwise (AND). 
Then we - * just need to test if the match is equal to - * the type, and if it is, we can short circuit the - * rest of the checks: - * - * if ((match && op->op == OP_OR) || - * (!match && op->op == OP_AND)) - * return match; - */ - type = op->op == OP_OR; - - for (i = 0; i < op->val; i++) { - pred = &preds[op->ops[i]]; - if (!WARN_ON_ONCE(!pred->fn)) - match = pred->fn(pred, rec); - if (!!match == type) - return match; - } - return match; -} - -struct filter_match_preds_data { - struct filter_pred *preds; - int match; - void *rec; -}; - -static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_match_preds_data *d = data; - - *err = 0; - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) - return WALK_PRED_DEFAULT; - /* We can treat folded ops as a leaf node */ - d->match = process_ops(d->preds, pred, d->rec); - } else { - if (!WARN_ON_ONCE(!pred->fn)) - d->match = pred->fn(pred, d->rec); - } - - return WALK_PRED_PARENT; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. 
- * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!d->match == (pred->op == OP_OR)) - return WALK_PRED_PARENT; - break; - case MOVE_UP_FROM_RIGHT: - break; - } - - return WALK_PRED_DEFAULT; -} - -/* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct event_filter *filter, void *rec) -{ - struct filter_pred *preds; - struct filter_pred *root; - struct filter_match_preds_data data = { - /* match is currently meaningless */ - .match = -1, - .rec = rec, - }; - int n_preds, ret; - - /* no filter is considered a match */ - if (!filter) - return 1; - - n_preds = filter->n_preds; - if (!n_preds) - return 1; - - /* - * n_preds, root and filter->preds are protect with preemption disabled. - */ - root = rcu_dereference_sched(filter->root); - if (!root) - return 1; - - data.preds = preds = rcu_dereference_sched(filter->preds); - ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); - WARN_ON(ret); - return data.match; -} -EXPORT_SYMBOL_GPL(filter_match_preds); - -static void parse_error(struct filter_parse_state *ps, int err, int pos) -{ - ps->lasterr = err; - ps->lasterr_pos = pos; -} - -static void remove_filter_string(struct event_filter *filter) -{ - if (!filter) - return; - - kfree(filter->filter_string); - filter->filter_string = NULL; -} - -static int replace_filter_string(struct event_filter *filter, - char *filter_string) -{ - kfree(filter->filter_string); - filter->filter_string = kstrdup(filter_string, GFP_KERNEL); - if (!filter->filter_string) - return -ENOMEM; - - return 0; -} - -static int append_filter_string(struct event_filter *filter, - char *string) -{ - int newlen; - char *new_filter_string; - - BUG_ON(!filter->filter_string); - newlen = strlen(filter->filter_string) + strlen(string) + 1; - new_filter_string = kmalloc(newlen, GFP_KERNEL); - if (!new_filter_string) - return -ENOMEM; - - 
strcpy(new_filter_string, filter->filter_string); - strcat(new_filter_string, string); - kfree(filter->filter_string); - filter->filter_string = new_filter_string; - - return 0; -} - -static void append_filter_err(struct filter_parse_state *ps, - struct event_filter *filter) -{ - int pos = ps->lasterr_pos; - char *buf, *pbuf; - - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return; - - append_filter_string(filter, "\n"); - memset(buf, ' ', PAGE_SIZE); - if (pos > PAGE_SIZE - 128) - pos = 0; - buf[pos] = '^'; - pbuf = &buf[pos] + 1; - - sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); - append_filter_string(filter, buf); - free_page((unsigned long) buf); -} - -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) -{ - struct event_filter *filter; - - mutex_lock(&event_mutex); - filter = call->filter; - if (filter && filter->filter_string) - trace_seq_printf(s, "%s\n", filter->filter_string); - else - trace_seq_printf(s, "none\n"); - mutex_unlock(&event_mutex); -} - -void print_subsystem_event_filter(struct event_subsystem *system, - struct trace_seq *s) -{ - struct event_filter *filter; - - mutex_lock(&event_mutex); - filter = system->filter; - if (filter && filter->filter_string) - trace_seq_printf(s, "%s\n", filter->filter_string); - else - trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); - mutex_unlock(&event_mutex); -} - -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ - struct ftrace_event_field *field; - - list_for_each_entry(field, head, link) { - if (!strcmp(field->name, name)) - return field; - } - - return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ - struct ftrace_event_field *field; - struct list_head *head; - - field = __find_event_field(&ftrace_common_fields, name); - if (field) - return field; - - head = trace_get_fields(call); - return __find_event_field(head, name); -} - -static int 
__alloc_pred_stack(struct pred_stack *stack, int n_preds) -{ - stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); - if (!stack->preds) - return -ENOMEM; - stack->index = n_preds; - return 0; -} - -static void __free_pred_stack(struct pred_stack *stack) -{ - kfree(stack->preds); - stack->index = 0; -} - -static int __push_pred_stack(struct pred_stack *stack, - struct filter_pred *pred) -{ - int index = stack->index; - - if (WARN_ON(index == 0)) - return -ENOSPC; - - stack->preds[--index] = pred; - stack->index = index; - return 0; -} - -static struct filter_pred * -__pop_pred_stack(struct pred_stack *stack) -{ - struct filter_pred *pred; - int index = stack->index; - - pred = stack->preds[index++]; - if (!pred) - return NULL; - - stack->index = index; - return pred; -} - -static int filter_set_pred(struct event_filter *filter, - int idx, - struct pred_stack *stack, - struct filter_pred *src) -{ - struct filter_pred *dest = &filter->preds[idx]; - struct filter_pred *left; - struct filter_pred *right; - - *dest = *src; - dest->index = idx; - - if (dest->op == OP_OR || dest->op == OP_AND) { - right = __pop_pred_stack(stack); - left = __pop_pred_stack(stack); - if (!left || !right) - return -EINVAL; - /* - * If both children can be folded - * and they are the same op as this op or a leaf, - * then this op can be folded. - */ - if (left->index & FILTER_PRED_FOLD && - (left->op == dest->op || - left->left == FILTER_PRED_INVALID) && - right->index & FILTER_PRED_FOLD && - (right->op == dest->op || - right->left == FILTER_PRED_INVALID)) - dest->index |= FILTER_PRED_FOLD; - - dest->left = left->index & ~FILTER_PRED_FOLD; - dest->right = right->index & ~FILTER_PRED_FOLD; - left->parent = dest->index & ~FILTER_PRED_FOLD; - right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else { - /* - * Make dest->left invalid to be used as a quick - * way to know this is a leaf node. 
- */ - dest->left = FILTER_PRED_INVALID; - - /* All leafs allow folding the parent ops. */ - dest->index |= FILTER_PRED_FOLD; - } - - return __push_pred_stack(stack, dest); -} - -static void __free_preds(struct event_filter *filter) -{ - if (filter->preds) { - kfree(filter->preds); - filter->preds = NULL; - } - filter->a_preds = 0; - filter->n_preds = 0; -} - -static void filter_disable(struct ftrace_event_call *call) -{ - call->flags &= ~TRACE_EVENT_FL_FILTERED; -} - -static void __free_filter(struct event_filter *filter) -{ - if (!filter) - return; - - __free_preds(filter); - kfree(filter->filter_string); - kfree(filter); -} - -/* - * Called when destroying the ftrace_event_call. - * The call is being freed, so we do not need to worry about - * the call being currently used. This is for module code removing - * the tracepoints from within it. - */ -void destroy_preds(struct ftrace_event_call *call) -{ - __free_filter(call->filter); - call->filter = NULL; -} - -static struct event_filter *__alloc_filter(void) -{ - struct event_filter *filter; - - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - return filter; -} - -static int __alloc_preds(struct event_filter *filter, int n_preds) -{ - struct filter_pred *pred; - int i; - - if (filter->preds) - __free_preds(filter); - - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - - filter->a_preds = n_preds; - filter->n_preds = 0; - - for (i = 0; i < n_preds; i++) { - pred = &filter->preds[i]; - pred->fn = filter_pred_none; - } - - return 0; -} - -static void filter_free_subsystem_preds(struct event_subsystem *system) -{ - struct ftrace_event_call *call; - - list_for_each_entry(call, &ftrace_events, list) { - if (strcmp(call->class->system, system->name) != 0) - continue; - - filter_disable(call); - remove_filter_string(call->filter); - } -} - -static void filter_free_subsystem_filters(struct event_subsystem *system) -{ - struct ftrace_event_call *call; - - 
list_for_each_entry(call, &ftrace_events, list) { - if (strcmp(call->class->system, system->name) != 0) - continue; - __free_filter(call->filter); - call->filter = NULL; - } -} - -static int filter_add_pred(struct filter_parse_state *ps, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack) -{ - int err; - - if (WARN_ON(filter->n_preds == filter->a_preds)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = filter_set_pred(filter, filter->n_preds, stack, pred); - if (err) - return err; - - filter->n_preds++; - - return 0; -} - -int filter_assign_type(const char *type) -{ - if (strstr(type, "__data_loc") && strstr(type, "char")) - return FILTER_DYN_STRING; - - if (strchr(type, '[') && strstr(type, "char")) - return FILTER_STATIC_STRING; - - return FILTER_OTHER; -} - -static bool is_string_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; -} - -static int is_legal_op(struct ftrace_event_field *field, int op) -{ - if (is_string_field(field) && - (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return 0; - if (!is_string_field(field) && op == OP_GLOB) - return 0; - - return 1; -} - -static filter_pred_fn_t select_comparison_fn(int op, int field_size, - int field_is_signed) -{ - filter_pred_fn_t fn = NULL; - - switch (field_size) { - case 8: - if (op == OP_EQ || op == OP_NE) - fn = filter_pred_64; - else if (field_is_signed) - fn = filter_pred_s64; - else - fn = filter_pred_u64; - break; - case 4: - if (op == OP_EQ || op == OP_NE) - fn = filter_pred_32; - else if (field_is_signed) - fn = filter_pred_s32; - else - fn = filter_pred_u32; - break; - case 2: - if (op == OP_EQ || op == OP_NE) - fn = filter_pred_16; - else if (field_is_signed) - fn = filter_pred_s16; - else - fn = filter_pred_u16; - break; - case 1: - if (op == OP_EQ || op == OP_NE) - fn = filter_pred_8; - else if 
(field_is_signed) - fn = filter_pred_s8; - else - fn = filter_pred_u8; - break; - } - - return fn; -} - -static int init_pred(struct filter_parse_state *ps, - struct ftrace_event_field *field, - struct filter_pred *pred) - -{ - filter_pred_fn_t fn = filter_pred_none; - unsigned long long val; - int ret; - - pred->offset = field->offset; - - if (!is_legal_op(field, pred->op)) { - parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); - return -EINVAL; - } - - if (is_string_field(field)) { - filter_build_regex(pred); - - if (field->filter_type == FILTER_STATIC_STRING) { - fn = filter_pred_string; - pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) - fn = filter_pred_strloc; - else - fn = filter_pred_pchar; - } else { - if (field->is_signed) - ret = strict_strtoll(pred->regex.pattern, 0, &val); - else - ret = strict_strtoull(pred->regex.pattern, 0, &val); - if (ret) { - parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); - return -EINVAL; - } - pred->val = val; - - fn = select_comparison_fn(pred->op, field->size, - field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - } - - if (pred->op == OP_NE) - pred->not = 1; - - pred->fn = fn; - return 0; -} - -static void parse_init(struct filter_parse_state *ps, - struct filter_op *ops, - char *infix_string) -{ - memset(ps, '\0', sizeof(*ps)); - - ps->infix.string = infix_string; - ps->infix.cnt = strlen(infix_string); - ps->ops = ops; - - INIT_LIST_HEAD(&ps->opstack); - INIT_LIST_HEAD(&ps->postfix); -} - -static char infix_next(struct filter_parse_state *ps) -{ - ps->infix.cnt--; - - return ps->infix.string[ps->infix.tail++]; -} - -static char infix_peek(struct filter_parse_state *ps) -{ - if (ps->infix.tail == strlen(ps->infix.string)) - return 0; - - return ps->infix.string[ps->infix.tail]; -} - -static void infix_advance(struct filter_parse_state *ps) -{ - ps->infix.cnt--; - ps->infix.tail++; -} - -static inline int is_precedence_lower(struct 
filter_parse_state *ps, - int a, int b) -{ - return ps->ops[a].precedence < ps->ops[b].precedence; -} - -static inline int is_op_char(struct filter_parse_state *ps, char c) -{ - int i; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (ps->ops[i].string[0] == c) - return 1; - } - - return 0; -} - -static int infix_get_op(struct filter_parse_state *ps, char firstc) -{ - char nextc = infix_peek(ps); - char opstr[3]; - int i; - - opstr[0] = firstc; - opstr[1] = nextc; - opstr[2] = '\0'; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) { - infix_advance(ps); - return ps->ops[i].id; - } - } - - opstr[1] = '\0'; - - for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { - if (!strcmp(opstr, ps->ops[i].string)) - return ps->ops[i].id; - } - - return OP_NONE; -} - -static inline void clear_operand_string(struct filter_parse_state *ps) -{ - memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); - ps->operand.tail = 0; -} - -static inline int append_operand_char(struct filter_parse_state *ps, char c) -{ - if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) - return -EINVAL; - - ps->operand.string[ps->operand.tail++] = c; - - return 0; -} - -static int filter_opstack_push(struct filter_parse_state *ps, int op) -{ - struct opstack_op *opstack_op; - - opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); - if (!opstack_op) - return -ENOMEM; - - opstack_op->op = op; - list_add(&opstack_op->list, &ps->opstack); - - return 0; -} - -static int filter_opstack_empty(struct filter_parse_state *ps) -{ - return list_empty(&ps->opstack); -} - -static int filter_opstack_top(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; - - if (filter_opstack_empty(ps)) - return OP_NONE; - - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); - - return opstack_op->op; -} - -static int filter_opstack_pop(struct filter_parse_state *ps) -{ - struct opstack_op *opstack_op; - int op; - - if 
(filter_opstack_empty(ps)) - return OP_NONE; - - opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); - op = opstack_op->op; - list_del(&opstack_op->list); - - kfree(opstack_op); - - return op; -} - -static void filter_opstack_clear(struct filter_parse_state *ps) -{ - while (!filter_opstack_empty(ps)) - filter_opstack_pop(ps); -} - -static char *curr_operand(struct filter_parse_state *ps) -{ - return ps->operand.string; -} - -static int postfix_append_operand(struct filter_parse_state *ps, char *operand) -{ - struct postfix_elt *elt; - - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; - - elt->op = OP_NONE; - elt->operand = kstrdup(operand, GFP_KERNEL); - if (!elt->operand) { - kfree(elt); - return -ENOMEM; - } - - list_add_tail(&elt->list, &ps->postfix); - - return 0; -} - -static int postfix_append_op(struct filter_parse_state *ps, int op) -{ - struct postfix_elt *elt; - - elt = kmalloc(sizeof(*elt), GFP_KERNEL); - if (!elt) - return -ENOMEM; - - elt->op = op; - elt->operand = NULL; - - list_add_tail(&elt->list, &ps->postfix); - - return 0; -} - -static void postfix_clear(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; - - while (!list_empty(&ps->postfix)) { - elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - list_del(&elt->list); - kfree(elt->operand); - kfree(elt); - } -} - -static int filter_parse(struct filter_parse_state *ps) -{ - int in_string = 0; - int op, top_op; - char ch; - - while ((ch = infix_next(ps))) { - if (ch == '"') { - in_string ^= 1; - continue; - } - - if (in_string) - goto parse_operand; - - if (isspace(ch)) - continue; - - if (is_op_char(ps, ch)) { - op = infix_get_op(ps, ch); - if (op == OP_NONE) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; - } - - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_top(ps); - if 
(!is_precedence_lower(ps, top_op, op)) { - top_op = filter_opstack_pop(ps); - postfix_append_op(ps, top_op); - continue; - } - break; - } - - filter_opstack_push(ps, op); - continue; - } - - if (ch == '(') { - filter_opstack_push(ps, OP_OPEN_PAREN); - continue; - } - - if (ch == ')') { - if (strlen(curr_operand(ps))) { - postfix_append_operand(ps, curr_operand(ps)); - clear_operand_string(ps); - } - - top_op = filter_opstack_pop(ps); - while (top_op != OP_NONE) { - if (top_op == OP_OPEN_PAREN) - break; - postfix_append_op(ps, top_op); - top_op = filter_opstack_pop(ps); - } - if (top_op == OP_NONE) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; - } - continue; - } -parse_operand: - if (append_operand_char(ps, ch)) { - parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); - return -EINVAL; - } - } - - if (strlen(curr_operand(ps))) - postfix_append_operand(ps, curr_operand(ps)); - - while (!filter_opstack_empty(ps)) { - top_op = filter_opstack_pop(ps); - if (top_op == OP_NONE) - break; - if (top_op == OP_OPEN_PAREN) { - parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); - return -EINVAL; - } - postfix_append_op(ps, top_op); - } - - return 0; -} - -static struct filter_pred *create_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, - int op, char *operand1, char *operand2) -{ - struct ftrace_event_field *field; - static struct filter_pred pred; - - memset(&pred, 0, sizeof(pred)); - pred.op = op; - - if (op == OP_AND || op == OP_OR) - return &pred; - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return NULL; - } - - field = find_event_field(call, operand1); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return NULL; - } - - strcpy(pred.regex.pattern, operand2); - pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST - pred.field = field; -#endif - return init_pred(ps, field, &pred) ? 
NULL : &pred; -} - -static int check_preds(struct filter_parse_state *ps) -{ - int n_normal_preds = 0, n_logical_preds = 0; - struct postfix_elt *elt; - - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) - continue; - - if (elt->op == OP_AND || elt->op == OP_OR) { - n_logical_preds++; - continue; - } - n_normal_preds++; - } - - if (!n_normal_preds || n_logical_preds >= n_normal_preds) { - parse_error(ps, FILT_ERR_INVALID_FILTER, 0); - return -EINVAL; - } - - return 0; -} - -static int count_preds(struct filter_parse_state *ps) -{ - struct postfix_elt *elt; - int n_preds = 0; - - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) - continue; - n_preds++; - } - - return n_preds; -} - -struct check_pred_data { - int count; - int max; -}; - -static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct check_pred_data *d = data; - - if (WARN_ON(d->count++ > d->max)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - return WALK_PRED_DEFAULT; -} - -/* - * The tree is walked at filtering of an event. If the tree is not correctly - * built, it may cause an infinite loop. Check here that the tree does - * indeed terminate. - */ -static int check_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - struct check_pred_data data = { - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. 
- */ - .max = 3 * filter->n_preds, - .count = 0, - }; - - return walk_pred_tree(filter->preds, root, - check_pred_tree_cb, &data); -} - -static int count_leafs_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - int *count = data; - - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) - (*count)++; - - return WALK_PRED_DEFAULT; -} - -static int count_leafs(struct filter_pred *preds, struct filter_pred *root) -{ - int count = 0, ret; - - ret = walk_pred_tree(preds, root, count_leafs_cb, &count); - WARN_ON(ret); - return count; -} - -struct fold_pred_data { - struct filter_pred *root; - int count; - int children; -}; - -static int fold_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct fold_pred_data *d = data; - struct filter_pred *root = d->root; - - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (pred->left != FILTER_PRED_INVALID) - return WALK_PRED_DEFAULT; - - if (WARN_ON(d->count == d->children)) { - *err = -EINVAL; - return WALK_PRED_ABORT; - } - - pred->index &= ~FILTER_PRED_FOLD; - root->ops[d->count++] = pred->index; - return WALK_PRED_DEFAULT; -} - -static int fold_pred(struct filter_pred *preds, struct filter_pred *root) -{ - struct fold_pred_data data = { - .root = root, - .count = 0, - }; - int children; - - /* No need to keep the fold flag */ - root->index &= ~FILTER_PRED_FOLD; - - /* If the root is a leaf then do nothing */ - if (root->left == FILTER_PRED_INVALID) - return 0; - - /* count the children */ - children = count_leafs(preds, &preds[root->left]); - children += count_leafs(preds, &preds[root->right]); - - root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); - if (!root->ops) - return -ENOMEM; - - root->val = children; - data.children = children; - return walk_pred_tree(preds, root, fold_pred_cb, &data); -} - -static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - struct filter_pred *preds = 
data; - - if (move != MOVE_DOWN) - return WALK_PRED_DEFAULT; - if (!(pred->index & FILTER_PRED_FOLD)) - return WALK_PRED_DEFAULT; - - *err = fold_pred(preds, pred); - if (*err) - return WALK_PRED_ABORT; - - /* eveyrhing below is folded, continue with parent */ - return WALK_PRED_PARENT; -} - -/* - * To optimize the processing of the ops, if we have several "ors" or - * "ands" together, we can put them in an array and process them all - * together speeding up the filter logic. - */ -static int fold_pred_tree(struct event_filter *filter, - struct filter_pred *root) -{ - return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, - filter->preds); -} - -static int replace_preds(struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_parse_state *ps, - char *filter_string, - bool dry_run) -{ - char *operand1 = NULL, *operand2 = NULL; - struct filter_pred *pred; - struct filter_pred *root; - struct postfix_elt *elt; - struct pred_stack stack = { }; /* init to NULL */ - int err; - int n_preds = 0; - - n_preds = count_preds(ps); - if (n_preds >= MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - err = check_preds(ps); - if (err) - return err; - - if (!dry_run) { - err = __alloc_pred_stack(&stack, n_preds); - if (err) - return err; - err = __alloc_preds(filter, n_preds); - if (err) - goto fail; - } - - n_preds = 0; - list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) { - if (!operand1) - operand1 = elt->operand; - else if (!operand2) - operand2 = elt->operand; - else { - parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - err = -EINVAL; - goto fail; - } - continue; - } - - if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - err = -ENOSPC; - goto fail; - } - - pred = create_pred(ps, call, elt->op, operand1, operand2); - if (!pred) { - err = -EINVAL; - goto fail; - } - - if (!dry_run) { - err = filter_add_pred(ps, filter, pred, &stack); - if (err) - 
goto fail; - } - - operand1 = operand2 = NULL; - } - - if (!dry_run) { - /* We should have one item left on the stack */ - pred = __pop_pred_stack(&stack); - if (!pred) - return -EINVAL; - /* This item is where we start from in matching */ - root = pred; - /* Make sure the stack is empty */ - pred = __pop_pred_stack(&stack); - if (WARN_ON(pred)) { - err = -EINVAL; - filter->root = NULL; - goto fail; - } - err = check_pred_tree(filter, root); - if (err) - goto fail; - - /* Optimize the tree */ - err = fold_pred_tree(filter, root); - if (err) - goto fail; - - /* We don't set root until we know it works */ - barrier(); - filter->root = root; - } - - err = 0; -fail: - __free_pred_stack(&stack); - return err; -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - -static int replace_system_preds(struct event_subsystem *system, - struct filter_parse_state *ps, - char *filter_string) -{ - struct ftrace_event_call *call; - struct filter_list *filter_item; - struct filter_list *tmp; - LIST_HEAD(filter_list); - bool fail = true; - int err; - - list_for_each_entry(call, &ftrace_events, list) { - - if (strcmp(call->class->system, system->name) != 0) - continue; - - /* - * Try to see if the filter can be applied - * (filter arg is ignored on dry_run) - */ - err = replace_preds(call, NULL, ps, filter_string, true); - if (err) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; - } - - list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter; - - if (strcmp(call->class->system, system->name) != 0) - continue; - - if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) - continue; - - filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); - if (!filter_item) - goto fail_mem; - - list_add_tail(&filter_item->list, &filter_list); - - filter_item->filter = __alloc_filter(); - if (!filter_item->filter) - goto fail_mem; - filter = filter_item->filter; - - /* Can only fail on no memory */ 
- err = replace_filter_string(filter, filter_string); - if (err) - goto fail_mem; - - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; - /* - * Regardless of if this returned an error, we still - * replace the filter for the call. - */ - filter = call->filter; - rcu_assign_pointer(call->filter, filter_item->filter); - filter_item->filter = filter; - - fail = false; - } - - if (fail) - goto fail; - - /* - * The calls can still be using the old filters. - * Do a synchronize_sched() to ensure all calls are - * done with them before we free them. - */ - synchronize_sched(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } - return 0; - fail: - /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; - fail_mem: - /* If any call succeeded, we still need to sync */ - if (!fail) - synchronize_sched(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } - return -ENOMEM; -} - -static int create_filter_start(char *filter_str, bool set_str, - struct filter_parse_state **psp, - struct event_filter **filterp) -{ - struct event_filter *filter; - struct filter_parse_state *ps = NULL; - int err = 0; - - WARN_ON_ONCE(*psp || *filterp); - - /* allocate everything, and if any fails, free all and fail */ - filter = __alloc_filter(); - if (filter && set_str) - err = replace_filter_string(filter, filter_str); - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - - if (!filter || !ps || err) { - kfree(ps); - __free_filter(filter); - 
return -ENOMEM; - } - - /* we're committed to creating a new filter */ - *filterp = filter; - *psp = ps; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err && set_str) - append_filter_err(ps, filter); - return err; -} - -static void create_filter_finish(struct filter_parse_state *ps) -{ - if (ps) { - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - } -} - -/** - * create_filter - create a filter for a ftrace_event_call - * @call: ftrace_event_call to create a filter for - * @filter_str: filter string - * @set_str: remember @filter_str and enable detailed error in filter - * @filterp: out param for created filter (always updated on return) - * - * Creates a filter for @call with @filter_str. If @set_str is %true, - * @filter_str is copied and recorded in the new filter. - * - * On success, returns 0 and *@filterp points to the new filter. On - * failure, returns -errno and *@filterp may point to %NULL or to a new - * filter. In the latter case, the returned filter contains error - * information if @set_str is %true and the caller is responsible for - * freeing it. - */ -static int create_filter(struct ftrace_event_call *call, - char *filter_str, bool set_str, - struct event_filter **filterp) -{ - struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; - int err; - - err = create_filter_start(filter_str, set_str, &ps, &filter); - if (!err) { - err = replace_preds(call, filter, ps, filter_str, false); - if (err && set_str) - append_filter_err(ps, filter); - } - create_filter_finish(ps); - - *filterp = filter; - return err; -} - -/** - * create_system_filter - create a filter for an event_subsystem - * @system: event_subsystem to create a filter for - * @filter_str: filter string - * @filterp: out param for created filter (always updated on return) - * - * Identical to create_filter() except that it creates a subsystem filter - * and always remembers @filter_str. 
- */ -static int create_system_filter(struct event_subsystem *system, - char *filter_str, struct event_filter **filterp) -{ - struct event_filter *filter = NULL; - struct filter_parse_state *ps = NULL; - int err; - - err = create_filter_start(filter_str, true, &ps, &filter); - if (!err) { - err = replace_system_preds(system, ps, filter_str); - if (!err) { - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - } else { - append_filter_err(ps, filter); - } - } - create_filter_finish(ps); - - *filterp = filter; - return err; -} - -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) -{ - struct event_filter *filter; - int err = 0; - - mutex_lock(&event_mutex); - - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable(call); - filter = call->filter; - if (!filter) - goto out_unlock; - RCU_INIT_POINTER(call->filter, NULL); - /* Make sure the filter is not being used */ - synchronize_sched(); - __free_filter(filter); - goto out_unlock; - } - - err = create_filter(call, filter_string, true, &filter); - - /* - * Always swap the call filter with the new filter - * even if there was an error. 
If there was an error - * in the filter, we disable the filter and show the error - * string - */ - if (filter) { - struct event_filter *tmp = call->filter; - - if (!err) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - filter_disable(call); - - rcu_assign_pointer(call->filter, filter); - - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); - } - } -out_unlock: - mutex_unlock(&event_mutex); - - return err; -} - -int apply_subsystem_event_filter(struct event_subsystem *system, - char *filter_string) -{ - struct event_filter *filter; - int err = 0; - - mutex_lock(&event_mutex); - - /* Make sure the system still has events */ - if (!system->nr_events) { - err = -ENODEV; - goto out_unlock; - } - - if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); - remove_filter_string(system->filter); - filter = system->filter; - system->filter = NULL; - /* Ensure all filters are no longer used */ - synchronize_sched(); - filter_free_subsystem_filters(system); - __free_filter(filter); - goto out_unlock; - } - - err = create_system_filter(system, filter_string, &filter); - if (filter) { - /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). 
- */ - __free_filter(system->filter); - system->filter = filter; - } -out_unlock: - mutex_unlock(&event_mutex); - - return err; -} - -#ifdef CONFIG_PERF_EVENTS - -void ftrace_profile_free_filter(struct perf_event *event) -{ - struct event_filter *filter = event->filter; - - event->filter = NULL; - __free_filter(filter); -} - -int ftrace_profile_set_filter(struct perf_event *event, int event_id, - char *filter_str) -{ - int err; - struct event_filter *filter; - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - - call = event->tp_event; - - err = -EINVAL; - if (!call) - goto out_unlock; - - err = -EEXIST; - if (event->filter) - goto out_unlock; - - err = create_filter(call, filter_str, false, &filter); - if (!err) - event->filter = filter; - else - __free_filter(filter); - -out_unlock: - mutex_unlock(&event_mutex); - - return err; -} - -#endif /* CONFIG_PERF_EVENTS */ - -#ifdef CONFIG_FTRACE_STARTUP_TEST - -#include -#include - -#define CREATE_TRACE_POINTS -#include "trace_events_filter_test.h" - -#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ -{ \ - .filter = FILTER, \ - .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ - .e = ve, .f = vf, .g = vg, .h = vh }, \ - .match = m, \ - .not_visited = nvisit, \ -} -#define YES 1 -#define NO 0 - -static struct test_filter_data_t { - char *filter; - struct ftrace_raw_ftrace_test_filter rec; - int match; - char *not_visited; -} test_filter_data[] = { -#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ - "e == 1 && f == 1 && g == 1 && h == 1" - DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), - DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), - DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), -#undef FILTER -#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ - "e == 1 || f == 1 || g == 1 || h == 1" - DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), - DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), - DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), -#undef FILTER -#define FILTER "(a == 1 || b == 1) && (c == 1 
|| d == 1) && " \ - "(e == 1 || f == 1) && (g == 1 || h == 1)" - DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), - DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), - DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), - DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), -#undef FILTER -#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ - "(e == 1 && f == 1) || (g == 1 && h == 1)" - DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), - DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), - DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), -#undef FILTER -#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ - "(e == 1 && f == 1) || (g == 1 && h == 1)" - DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), - DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), - DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), -#undef FILTER -#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ - "(e == 1 || f == 1)) && (g == 1 || h == 1)" - DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), - DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), - DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), -#undef FILTER -#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ - "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" - DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), - DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), - DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), -#undef FILTER -#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ - "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" - DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), - DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), - DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), -}; - -#undef DATA_REC -#undef FILTER -#undef YES -#undef NO - -#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) - -static int test_pred_visited; - -static int test_pred_visited_fn(struct filter_pred *pred, void *event) -{ - struct ftrace_event_field *field = pred->field; - - test_pred_visited = 1; - printk(KERN_INFO "\npred visited %s\n", field->name); - 
return 1; -} - -static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, - int *err, void *data) -{ - char *fields = data; - - if ((move == MOVE_DOWN) && - (pred->left == FILTER_PRED_INVALID)) { - struct ftrace_event_field *field = pred->field; - - if (!field) { - WARN(1, "all leafs should have field defined"); - return WALK_PRED_DEFAULT; - } - if (!strchr(fields, *field->name)) - return WALK_PRED_DEFAULT; - - WARN_ON(!pred->fn); - pred->fn = test_pred_visited_fn; - } - return WALK_PRED_DEFAULT; -} - -static __init int ftrace_test_event_filter(void) -{ - int i; - - printk(KERN_INFO "Testing ftrace filter: "); - - for (i = 0; i < DATA_CNT; i++) { - struct event_filter *filter = NULL; - struct test_filter_data_t *d = &test_filter_data[i]; - int err; - - err = create_filter(&event_ftrace_test_filter, d->filter, - false, &filter); - if (err) { - printk(KERN_INFO - "Failed to get filter for '%s', err %d\n", - d->filter, err); - __free_filter(filter); - break; - } - - /* - * The preemption disabling is not really needed for self - * tests, but the rcu dereference will complain without it. 
- */ - preempt_disable(); - if (*d->not_visited) - walk_pred_tree(filter->preds, filter->root, - test_walk_pred_cb, - d->not_visited); - - test_pred_visited = 0; - err = filter_match_preds(filter, &d->rec); - preempt_enable(); - - __free_filter(filter); - - if (test_pred_visited) { - printk(KERN_INFO - "Failed, unwanted pred visited for filter %s\n", - d->filter); - break; - } - - if (err != d->match) { - printk(KERN_INFO - "Failed to match filter '%s', expected %d\n", - d->filter, d->match); - break; - } - } - - if (i == DATA_CNT) - printk(KERN_CONT "OK\n"); - - return 0; -} - -late_initcall(ftrace_test_event_filter); - -#endif /* CONFIG_FTRACE_STARTUP_TEST */ -/* - * trace_export.c - export basic ftrace utilities to user space - * - * Copyright (C) 2009 Steven Rostedt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace_output.h" - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* not needed for this file */ -#undef __field_struct -#define __field_struct(type, item) - -#undef __field -#define __field(type, item) type item; - -#undef __field_desc -#define __field_desc(type, container, item) type item; - -#undef __array -#define __array(type, item, size) type item[size]; - -#undef __array_desc -#define __array_desc(type, container, item, size) type item[size]; - -#undef __dynamic_array -#define __dynamic_array(type, item) type item[]; - -#undef F_STRUCT -#define F_STRUCT(args...) args - -#undef F_printk -#define F_printk(fmt, args...) 
fmt, args - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ -} - -#undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) - -#include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - do { \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ - mutex_unlock(&event_storage_mutex); \ - if (ret) \ - return ret; \ - } while (0); - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), \ - container.item), \ - sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - 0, 
is_signed_type(type), FILTER_OTHER);\ - if (ret) \ - return ret; - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -int \ -ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ -{ \ - struct struct_name field; \ - int ret; \ - \ - tstruct; \ - \ - return ret; \ -} - -#include "trace_entries.h" - -#undef __entry -#define __entry REC - -#undef __field -#define __field(type, item) - -#undef __field_desc -#define __field_desc(type, container, item) - -#undef __array -#define __array(type, item, len) - -#undef __array_desc -#define __array_desc(type, container, item, len) - -#undef __dynamic_array -#define __dynamic_array(type, item) - -#undef F_printk -#define F_printk(fmt, args...) #fmt ", " __stringify(args) - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ - \ -struct ftrace_event_class event_class_ftrace_##call = { \ - .system = __stringify(TRACE_SYSTEM), \ - .define_fields = ftrace_define_fields_##call, \ - .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ -}; \ - \ -struct ftrace_event_call __used event_##call = { \ - .name = #call, \ - .event.type = etype, \ - .class = &event_class_ftrace_##call, \ - .print_fmt = print, \ -}; \ -struct ftrace_event_call __used \ -__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; - -#include "trace_entries.h" -/* - * ring buffer based function tracer - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * Based on code from the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include -#include - -#include "trace.h" - -/* function tracing enabled */ -static int ftrace_function_enabled; - -static struct trace_array *func_trace; - -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); - -static int function_trace_init(struct 
trace_array *tr) -{ - func_trace = tr; - tr->cpu = get_cpu(); - put_cpu(); - - tracing_start_cmdline_record(); - tracing_start_function_trace(); - return 0; -} - -static void function_trace_reset(struct trace_array *tr) -{ - tracing_stop_function_trace(); - tracing_stop_cmdline_record(); -} - -static void function_trace_start(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); -} - -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. 
- */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, ip, parent_ip, flags, pc); - } - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); - } - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; - -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - -static struct tracer_opt func_opts[] = { -#ifdef CONFIG_STACKTRACE - { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, -#endif - { } /* Always set a last empty entry */ -}; - -static struct tracer_flags func_flags = { - .val = 0, /* By default: all flags disabled */ - .opts = func_opts -}; - -static void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - 
trace_ops.func = function_trace_call; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - register_ftrace_function(&trace_stack_ops); - else - register_ftrace_function(&trace_ops); - - ftrace_function_enabled = 1; -} - -static void tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - unregister_ftrace_function(&trace_stack_ops); - else - unregister_ftrace_function(&trace_ops); -} - -static int func_set_flag(u32 old_flags, u32 bit, int set) -{ - if (bit == TRACE_FUNC_OPT_STACK) { - /* do nothing if already set */ - if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; - - if (set) { - unregister_ftrace_function(&trace_ops); - register_ftrace_function(&trace_stack_ops); - } else { - unregister_ftrace_function(&trace_stack_ops); - register_ftrace_function(&trace_ops); - } - - return 0; - } - - return -EINVAL; -} - -static struct tracer function_trace __read_mostly = -{ - .name = "function", - .init = function_trace_init, - .reset = function_trace_reset, - .start = function_trace_start, - .wait_pipe = poll_wait_pipe, - .flags = &func_flags, - .set_flag = func_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_function, -#endif -}; - -#ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) -{ - long *count = (long *)data; - - if (tracing_is_on()) - return; - - if (!*count) - return; - - if (*count != -1) - (*count)--; - - tracing_on(); -} - -static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) -{ - long *count = (long *)data; - - if (!tracing_is_on()) - return; - - if (!*count) - return; - - if (*count != -1) - (*count)--; - - tracing_off(); -} - -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data); - -static struct ftrace_probe_ops traceon_probe_ops = { - .func = ftrace_traceon, - .print = ftrace_trace_onoff_print, 
-}; - -static struct ftrace_probe_ops traceoff_probe_ops = { - .func = ftrace_traceoff, - .print = ftrace_trace_onoff_print, -}; - -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) -{ - long count = (long)data; - - seq_printf(m, "%ps:", (void *)ip); - - if (ops == &traceon_probe_ops) - seq_printf(m, "traceon"); - else - seq_printf(m, "traceoff"); - - if (count == -1) - seq_printf(m, ":unlimited\n"); - else - seq_printf(m, ":count=%ld\n", count); - - return 0; -} - -static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) -{ - struct ftrace_probe_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; - - unregister_ftrace_function_probe_func(glob, ops); - - return 0; -} - -static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) -{ - struct ftrace_probe_ops *ops; - void *count = (void *)-1; - char *number; - int ret; - - /* hash funcs only work with set_ftrace_filter */ - if (!enable) - return -EINVAL; - - if (glob[0] == '!') - return ftrace_trace_onoff_unreg(glob+1, cmd, param); - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; - - if (!param) - goto out_reg; - - number = strsep(&param, ":"); - - if (!strlen(number)) - goto out_reg; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = strict_strtoul(number, 0, (unsigned long *)&count); - if (ret) - return ret; - - out_reg: - ret = register_ftrace_function_probe(glob, ops, count); - - return ret < 0 ?
ret : 0; -} - -static struct ftrace_func_command ftrace_traceon_cmd = { - .name = "traceon", - .func = ftrace_trace_onoff_callback, -}; - -static struct ftrace_func_command ftrace_traceoff_cmd = { - .name = "traceoff", - .func = ftrace_trace_onoff_callback, -}; - -static int __init init_func_cmd_traceon(void) -{ - int ret; - - ret = register_ftrace_command(&ftrace_traceoff_cmd); - if (ret) - return ret; - - ret = register_ftrace_command(&ftrace_traceon_cmd); - if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); - return ret; -} -#else -static inline int init_func_cmd_traceon(void) -{ - return 0; -} -#endif /* CONFIG_DYNAMIC_FTRACE */ - -static __init int init_function_trace(void) -{ - init_func_cmd_traceon(); - return register_tracer(&function_trace); -} -device_initcall(init_function_trace); - -/* - * - * Function graph tracer. - * Copyright (c) 2008-2009 Frederic Weisbecker - * Mostly borrowed from function tracer which - * is Copyright (c) Steven Rostedt - * - */ -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -/* When set, irq functions will be ignored */ -static int ftrace_graph_skip_irqs; - -struct fgraph_cpu_data { - pid_t last_pid; - int depth; - int depth_irq; - int ignore; - unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; -}; - -struct fgraph_data { - struct fgraph_cpu_data __percpu *cpu_data; - - /* Place to preserve last processed entry. */ - struct ftrace_graph_ent_entry ent; - struct ftrace_graph_ret_entry ret; - int failed; - int cpu; -}; - -#define TRACE_GRAPH_INDENT 2 - -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN 0x1 -#define TRACE_GRAPH_PRINT_CPU 0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 -#define TRACE_GRAPH_PRINT_PROC 0x8 -#define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 - -static struct tracer_opt trace_opts[] = { - /* Display overruns? 
(for self-debug purpose) */ - { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, - /* Display CPU ? */ - { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, - /* Display Overhead ? */ - { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, - /* Display proc name/pid */ - { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, - /* Display duration of execution */ - { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, - /* Display absolute time of an entry */ - { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, - /* Display interrupts */ - { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - /* Don't display overruns and proc by default */ - .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, - .opts = trace_opts -}; - -static struct trace_array *graph_array; - -/* - * DURATION column is being also used to display IRQ signs, - * following values are used by print_graph_irq and others - * to fill in space into DURATION column. - */ -enum { - DURATION_FILL_FULL = -1, - DURATION_FILL_START = -2, - DURATION_FILL_END = -3, -}; - -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags); - -/* Add a function return address to the trace stack on thread info.*/ -int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer) -{ - unsigned long long calltime; - int index; - - if (!current->ret_stack) - return -EBUSY; - - /* - * We must make sure the ret_stack is tested before we read - * anything else. 
- */ - smp_rmb(); - - /* The return trace stack is full */ - if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { - atomic_inc(&current->trace_overrun); - return -EBUSY; - } - - calltime = trace_clock_local(); - - index = ++current->curr_ret_stack; - barrier(); - current->ret_stack[index].ret = ret; - current->ret_stack[index].func = func; - current->ret_stack[index].calltime = calltime; - current->ret_stack[index].subtime = 0; - current->ret_stack[index].fp = frame_pointer; - *depth = index; - - return 0; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, - unsigned long frame_pointer) -{ - int index; - - index = current->curr_ret_stack; - - if (unlikely(index < 0)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic, otherwise we have no where to go */ - *ret = (unsigned long)panic; - return; - } - -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST - /* - * The arch may choose to record the frame pointer used - * and check it here to make sure that it is what we expect it - * to be. If gcc does not set the place holder of the return - * address in the frame pointer, and does a copy instead, then - * the function graph trace will fail. This test detects this - * case. - * - * Currently, x86_32 with optimize for size (-Os) makes the latest - * gcc do the above.
- */ - if (unlikely(current->ret_stack[index].fp != frame_pointer)) { - ftrace_graph_stop(); - WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %ps return to %lx\n", - current->ret_stack[index].fp, - frame_pointer, - (void *)current->ret_stack[index].func, - current->ret_stack[index].ret); - *ret = (unsigned long)panic; - return; - } -#endif - - *ret = current->ret_stack[index].ret; - trace->func = current->ret_stack[index].func; - trace->calltime = current->ret_stack[index].calltime; - trace->overrun = atomic_read(&current->trace_overrun); - trace->depth = index; -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) -{ - struct ftrace_graph_ret trace; - unsigned long ret; - - ftrace_pop_return_trace(&trace, &ret, frame_pointer); - trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); - barrier(); - current->curr_ret_stack--; - - if (unlikely(!ret)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic. What else to do?
*/ - ret = (unsigned long)panic; - } - - return ret; -} - -int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_entry; - struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; - struct ftrace_graph_ent_entry *entry; - - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return 0; - - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); - if (!event) - return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); - - return 1; -} - -static inline int ftrace_graph_ignore_irqs(void) -{ - if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) - return 0; - - return in_irq(); -} - -int trace_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = graph_array; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int ret; - int cpu; - int pc; - - if (!ftrace_trace_task(current)) - return 0; - - /* trace it when it is-nested-in or is a function enabled. 
*/ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) - return 0; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else { - ret = 0; - } - - atomic_dec(&data->disabled); - local_irq_restore(flags); - - return ret; -} - -int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ - if (tracing_thresh) - return 1; - else - return trace_graph_entry(trace); -} - -static void -__trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - -void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) -{ - __trace_graph_function(tr, ip, flags, pc); -} - -void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_exit; - struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; - struct ftrace_graph_ret_entry *entry; - - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ret = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); -} - -void trace_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = graph_array; - struct trace_array_cpu *data; - unsigned long 
flags; - long disabled; - int cpu; - int pc; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -void set_graph_array(struct trace_array *tr) -{ - graph_array = tr; - - /* Make graph_array visible before we start tracing */ - - smp_mb(); -} - -void trace_graph_thresh_return(struct ftrace_graph_ret *trace) -{ - if (tracing_thresh && - (trace->rettime - trace->calltime < tracing_thresh)) - return; - else - trace_graph_return(trace); -} - -static int graph_trace_init(struct trace_array *tr) -{ - int ret; - - set_graph_array(tr); - if (tracing_thresh) - ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_thresh_entry); - else - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); - if (ret) - return ret; - tracing_start_cmdline_record(); - - return 0; -} - -static void graph_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - unregister_ftrace_graph(); -} - -static int max_bytes_for_cpu; - -static enum print_line_t -print_graph_cpu(struct trace_seq *s, int cpu) -{ - int ret; - - /* - * Start with a space character - to make it stand out - * to the right a bit when trace output is pasted into - * email: - */ - ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -#define TRACE_GRAPH_PROCINFO_LENGTH 14 - -static enum print_line_t -print_graph_proc(struct trace_seq *s, pid_t pid) -{ - char comm[TASK_COMM_LEN]; - /* sign + log10(MAX_INT) + '\0' */ - char pid_str[11]; - int spaces = 0; - int ret; - int len; - int i; - - trace_find_cmdline(pid, comm); - comm[7] = '\0'; - sprintf(pid_str, "%d", pid); - - /* 1 stands for the "-" character */ - len = strlen(comm) + strlen(pid_str) + 1; 
- - if (len < TRACE_GRAPH_PROCINFO_LENGTH) - spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; - - /* First spaces to align center */ - for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = trace_seq_printf(s, "%s-%s", comm, pid_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Last spaces to align center */ - for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; -} - - -static enum print_line_t -print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) -{ - if (!trace_seq_putc(s, ' ')) - return 0; - - return trace_print_lat_fmt(s, entry); -} - -/* If the pid changed since the last trace, output this event */ -static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) -{ - pid_t prev_pid; - pid_t *last_pid; - int ret; - - if (!data) - return TRACE_TYPE_HANDLED; - - last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); - - if (*last_pid == pid) - return TRACE_TYPE_HANDLED; - - prev_pid = *last_pid; - *last_pid = pid; - - if (prev_pid == -1) - return TRACE_TYPE_HANDLED; -/* - * Context-switch trace line: - - ------------------------------------------ - | 1) migration/0--1 => sshd-1755 - ------------------------------------------ - - */ - ret = trace_seq_printf(s, - " ------------------------------------------\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, prev_pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " => "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, - "\n 
------------------------------------------\n\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static struct ftrace_graph_ret_entry * -get_return_for_leaf(struct trace_iterator *iter, - struct ftrace_graph_ent_entry *curr) -{ - struct fgraph_data *data = iter->private; - struct ring_buffer_iter *ring_iter = NULL; - struct ring_buffer_event *event; - struct ftrace_graph_ret_entry *next; - - /* - * If the previous output failed to write to the seq buffer, - * then we just reuse the data from before. - */ - if (data && data->failed) { - curr = &data->ent; - next = &data->ret; - } else { - - ring_iter = iter->buffer_iter[iter->cpu]; - - /* First peek to compare current entry and the next one */ - if (ring_iter) - event = ring_buffer_iter_peek(ring_iter, NULL); - else { - /* - * We need to consume the current entry to see - * the next one. - */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, - NULL, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL, NULL); - } - - if (!event) - return NULL; - - next = ring_buffer_event_data(event); - - if (data) { - /* - * Save current and next entries for later reference - * if the output fails. - */ - data->ent = *curr; - /* - * If the next event is not a return type, then - * we only care about what type it is. Otherwise we can - * safely copy the entire event. 
- */ - if (next->ent.type == TRACE_GRAPH_RET) - data->ret = *next; - else - data->ret.ent.type = next->ent.type; - } - } - - if (next->ent.type != TRACE_GRAPH_RET) - return NULL; - - if (curr->ent.pid != next->ent.pid || - curr->graph_ent.func != next->ret.func) - return NULL; - - /* this is a leaf, now advance the iterator */ - if (ring_iter) - ring_buffer_read(ring_iter, NULL); - - return next; -} - -static int print_graph_abs_time(u64 t, struct trace_seq *s) -{ - unsigned long usecs_rem; - - usecs_rem = do_div(t, NSEC_PER_SEC); - usecs_rem /= 1000; - - return trace_seq_printf(s, "%5lu.%06lu | ", - (unsigned long)t, usecs_rem); -} - -static enum print_line_t -print_graph_irq(struct trace_iterator *iter, unsigned long addr, - enum trace_type type, int cpu, pid_t pid, u32 flags) -{ - int ret; - struct trace_seq *s = &iter->seq; - - if (addr < (unsigned long)__irqentry_text_start || - addr >= (unsigned long)__irqentry_text_end) - return TRACE_TYPE_UNHANDLED; - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - } - - /* No overhead */ - ret = print_graph_duration(DURATION_FILL_START, s, flags); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - if (type == TRACE_GRAPH_ENT) - ret = trace_seq_printf(s, "==========>"); - else - ret = trace_seq_printf(s, "<=========="); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(DURATION_FILL_END, s, flags); - if (ret != TRACE_TYPE_HANDLED) - 
return ret; - - ret = trace_seq_printf(s, "\n"); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -enum print_line_t -trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) -{ - unsigned long nsecs_rem = do_div(duration, 1000); - /* log10(ULONG_MAX) + '\0' */ - char msecs_str[21]; - char nsecs_str[5]; - int ret, len; - int i; - - sprintf(msecs_str, "%lu", (unsigned long) duration); - - /* Print msecs */ - ret = trace_seq_printf(s, "%s", msecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - len = strlen(msecs_str); - - /* Print nsecs (we don't want to exceed 7 numbers) */ - if (len < 7) { - size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); - - snprintf(nsecs_str, slen, "%03lu", nsecs_rem); - ret = trace_seq_printf(s, ".%s", nsecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - len += strlen(nsecs_str); - } - - ret = trace_seq_printf(s, " us "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags) -{ - int ret = -1; - - if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return TRACE_TYPE_HANDLED; - - /* No real adata, just filling the column with spaces */ - switch (duration) { - case DURATION_FILL_FULL: - ret = trace_seq_printf(s, " | "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_START: - ret = trace_seq_printf(s, " "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_END: - ret = trace_seq_printf(s, " |"); - return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - } - - /* Signal a overhead of time execution to the output */ - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - ret = trace_seq_printf(s, "! "); - /* Duration exceeded 10 msecs */ - else if (duration > 10000ULL) - ret = trace_seq_printf(s, "+ "); - } - - /* - * The -1 means we either did not exceed the duration tresholds - * or we dont want to print out the overhead. Either way we need - * to fill out the space. - */ - if (ret == -1) - ret = trace_seq_printf(s, " "); - - /* Catching here any failure happenned above */ - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_print_graph_duration(duration, s); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_printf(s, "| "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -/* Case of a leaf function on its call entry */ -static enum print_line_t -print_graph_entry_leaf(struct trace_iterator *iter, - struct ftrace_graph_ent_entry *entry, - struct ftrace_graph_ret_entry *ret_entry, - struct trace_seq *s, u32 flags) -{ - struct fgraph_data *data = iter->private; - struct ftrace_graph_ret *graph_ret; - struct ftrace_graph_ent *call; - unsigned long long duration; - int ret; - int i; - - graph_ret = &ret_entry->ret; - call = &entry->graph_ent; - duration = graph_ret->rettime - graph_ret->calltime; - - if (data) { - struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; - - cpu_data = per_cpu_ptr(data->cpu_data, cpu); - - /* - * Comments display at + 1 to depth. Since - * this is a leaf function, keep the comments - * equal to this depth. 
- */ - cpu_data->depth = call->depth - 1; - - /* No need to keep this function around for this depth */ - if (call->depth < FTRACE_RETFUNC_DEPTH) - cpu_data->enter_funcs[call->depth] = 0; - } - - /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -print_graph_entry_nested(struct trace_iterator *iter, - struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, int cpu, u32 flags) -{ - struct ftrace_graph_ent *call = &entry->graph_ent; - struct fgraph_data *data = iter->private; - int ret; - int i; - - if (data) { - struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; - - cpu_data = per_cpu_ptr(data->cpu_data, cpu); - cpu_data->depth = call->depth; - - /* Save this function pointer to see if the exit matches */ - if (call->depth < FTRACE_RETFUNC_DEPTH) - cpu_data->enter_funcs[call->depth] = call->func; - } - - /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* - * we already consumed the current entry to check the next one - * and see if this is a leaf. 
- */ - return TRACE_TYPE_NO_CONSUME; -} - -static enum print_line_t -print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, - int type, unsigned long addr, u32 flags) -{ - struct fgraph_data *data = iter->private; - struct trace_entry *ent = iter->ent; - int cpu = iter->cpu; - int ret; - - /* Pid */ - if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - if (type) { - /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return 0; - - /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - ret = print_graph_lat_fmt(s, ent); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - return 0; -} - -/* - * Entry check for irq code - * - * returns 1 if - * - we are inside irq code - * - we just entered irq code - * - * retunns 0 if - * - funcgraph-interrupts option is set - * - we are not inside irq code - */ -static int -check_irq_entry(struct trace_iterator *iter, u32 flags, - unsigned long addr, int depth) -{ - int cpu = iter->cpu; - int *depth_irq; - struct fgraph_data *data = iter->private; - - /* - * If we are either displaying irqs, or we got called as - * a graph event and private data does not exist, - * then we bypass the irq check. 
- */ - if ((flags & TRACE_GRAPH_PRINT_IRQS) || - (!data)) - return 0; - - depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - - /* - * We are inside the irq code - */ - if (*depth_irq >= 0) - return 1; - - if ((addr < (unsigned long)__irqentry_text_start) || - (addr >= (unsigned long)__irqentry_text_end)) - return 0; - - /* - * We are entering irq code. - */ - *depth_irq = depth; - return 1; -} - -/* - * Return check for irq code - * - * returns 1 if - * - we are inside irq code - * - we just left irq code - * - * returns 0 if - * - funcgraph-interrupts option is set - * - we are not inside irq code - */ -static int -check_irq_return(struct trace_iterator *iter, u32 flags, int depth) -{ - int cpu = iter->cpu; - int *depth_irq; - struct fgraph_data *data = iter->private; - - /* - * If we are either displaying irqs, or we got called as - * a graph event and private data does not exist, - * then we bypass the irq check. - */ - if ((flags & TRACE_GRAPH_PRINT_IRQS) || - (!data)) - return 0; - - depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - - /* - * We are not inside the irq code. - */ - if (*depth_irq == -1) - return 0; - - /* - * We are inside the irq code, and this is returning entry. - * Let's not trace it and clear the entry depth, since - * we are out of irq code. - * - * This condition ensures that we 'leave the irq code' once - * we are out of the entry depth. Thus protecting us from - * the RETURN entry loss. - */ - if (*depth_irq >= depth) { - *depth_irq = -1; - return 1; - } - - /* - * We are inside the irq code, and this is not the entry. 
- */ - return 1; -} - -static enum print_line_t -print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter, u32 flags) -{ - struct fgraph_data *data = iter->private; - struct ftrace_graph_ent *call = &field->graph_ent; - struct ftrace_graph_ret_entry *leaf_ret; - static enum print_line_t ret; - int cpu = iter->cpu; - - if (check_irq_entry(iter, flags, call->func, call->depth)) - return TRACE_TYPE_HANDLED; - - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) - return TRACE_TYPE_PARTIAL_LINE; - - leaf_ret = get_return_for_leaf(iter, field); - if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); - else - ret = print_graph_entry_nested(iter, field, s, cpu, flags); - - if (data) { - /* - * If we failed to write our output, then we need to make - * note of it. Because we already consumed our entry. - */ - if (s->full) { - data->failed = 1; - data->cpu = cpu; - } else - data->failed = 0; - } - - return ret; -} - -static enum print_line_t -print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter, - u32 flags) -{ - unsigned long long duration = trace->rettime - trace->calltime; - struct fgraph_data *data = iter->private; - pid_t pid = ent->pid; - int cpu = iter->cpu; - int func_match = 1; - int ret; - int i; - - if (check_irq_return(iter, flags, trace->depth)) - return TRACE_TYPE_HANDLED; - - if (data) { - struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; - - cpu_data = per_cpu_ptr(data->cpu_data, cpu); - - /* - * Comments display at + 1 to depth. This is the - * return from a function, we now want the comments - * to display at the same level of the bracket. 
- */ - cpu_data->depth = trace->depth - 1; - - if (trace->depth < FTRACE_RETFUNC_DEPTH) { - if (cpu_data->enter_funcs[trace->depth] != trace->func) - func_match = 0; - cpu_data->enter_funcs[trace->depth] = 0; - } - } - - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; - - /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - /* Closing brace */ - for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* - * If the return function does not have a matching entry, - * then the entry was lost. Instead of just printing - * the '}' and letting the user guess what function this - * belongs to, write out the function name. - */ - if (func_match) { - ret = trace_seq_printf(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } else { - ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Overrun */ - if (flags & TRACE_GRAPH_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)\n", - trace->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, - cpu, pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -print_graph_comment(struct trace_seq *s, struct trace_entry *ent, - struct trace_iterator *iter, u32 flags) -{ - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct fgraph_data *data = iter->private; - struct trace_event *event; - int depth = 0; - int ret; - int i; - - if (data) - depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; - - /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, 
flags); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - /* Indentation */ - if (depth > 0) - for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* The comment */ - ret = trace_seq_printf(s, "/* "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (iter->ent->type) { - case TRACE_BPRINT: - ret = trace_print_bprintk_msg_only(iter); - if (ret != TRACE_TYPE_HANDLED) - return ret; - break; - case TRACE_PRINT: - ret = trace_print_printk_msg_only(iter); - if (ret != TRACE_TYPE_HANDLED) - return ret; - break; - default: - event = ftrace_find_event(ent->type); - if (!event) - return TRACE_TYPE_UNHANDLED; - - ret = event->funcs->trace(iter, sym_flags, event); - if (ret != TRACE_TYPE_HANDLED) - return ret; - } - - /* Strip ending newline */ - if (s->buffer[s->len - 1] == '\n') { - s->buffer[s->len - 1] = '\0'; - s->len--; - } - - ret = trace_seq_printf(s, " */\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - - -enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) -{ - struct ftrace_graph_ent_entry *field; - struct fgraph_data *data = iter->private; - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - int cpu = iter->cpu; - int ret; - - if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { - per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; - return TRACE_TYPE_HANDLED; - } - - /* - * If the last output failed, there's a possibility we need - * to print out the missing entry which would never go out. 
- */ - if (data && data->failed) { - field = &data->ent; - iter->cpu = data->cpu; - ret = print_graph_entry(field, s, iter, flags); - if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { - per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; - ret = TRACE_TYPE_NO_CONSUME; - } - iter->cpu = cpu; - return ret; - } - - switch (entry->type) { - case TRACE_GRAPH_ENT: { - /* - * print_graph_entry() may consume the current event, - * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. - */ - struct ftrace_graph_ent_entry saved; - trace_assign_type(field, entry); - saved = *field; - return print_graph_entry(&saved, s, iter, flags); - } - case TRACE_GRAPH_RET: { - struct ftrace_graph_ret_entry *field; - trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter, flags); - } - case TRACE_STACK: - case TRACE_FN: - /* dont trace stack and functions as comments */ - return TRACE_TYPE_UNHANDLED; - - default: - return print_graph_comment(s, entry, iter, flags); - } - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -print_graph_function(struct trace_iterator *iter) -{ - return print_graph_function_flags(iter, tracer_flags.val); -} - -static enum print_line_t -print_graph_function_event(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return print_graph_function(iter); -} - -static void print_lat_header(struct seq_file *s, u32 flags) -{ - static const char spaces[] = " " /* 16 spaces */ - " " /* 4 spaces */ - " "; /* 17 spaces */ - int size = 0; - - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - size += 16; - if (flags & TRACE_GRAPH_PRINT_CPU) - size += 4; - if (flags & TRACE_GRAPH_PRINT_PROC) - size += 17; - - seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); - seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); - seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); - seq_printf(s, 
"#%.*s|| / _--=> preempt-depth \n", size, spaces); - seq_printf(s, "#%.*s||| / \n", size, spaces); -} - -static void __print_graph_headers_flags(struct seq_file *s, u32 flags) -{ - int lat = trace_flags & TRACE_ITER_LATENCY_FMT; - - if (lat) - print_lat_header(s, flags); - - /* 1st line */ - seq_printf(s, "#"); - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " TIME "); - if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " CPU"); - if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); - if (lat) - seq_printf(s, "||||"); - if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " DURATION "); - seq_printf(s, " FUNCTION CALLS\n"); - - /* 2nd line */ - seq_printf(s, "#"); - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " | "); - if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " | "); - if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); - if (lat) - seq_printf(s, "||||"); - if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " | | "); - seq_printf(s, " | | | |\n"); -} - -void print_graph_headers(struct seq_file *s) -{ - print_graph_headers_flags(s, tracer_flags.val); -} - -void print_graph_headers_flags(struct seq_file *s, u32 flags) -{ - struct trace_iterator *iter = s->private; - - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - } - - __print_graph_headers_flags(s, flags); -} - -void graph_trace_open(struct trace_iterator *iter) -{ - /* pid and depth on the last trace processed */ - struct fgraph_data *data; - int cpu; - - iter->private = NULL; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out_err; - - data->cpu_data = alloc_percpu(struct fgraph_cpu_data); - if (!data->cpu_data) - goto out_err_free; - - for_each_possible_cpu(cpu) { - pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); - int *depth = 
&(per_cpu_ptr(data->cpu_data, cpu)->depth); - int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - - *pid = -1; - *depth = 0; - *ignore = 0; - *depth_irq = -1; - } - - iter->private = data; - - return; - - out_err_free: - kfree(data); - out_err: - pr_warning("function graph tracer: not enough memory\n"); -} - -void graph_trace_close(struct trace_iterator *iter) -{ - struct fgraph_data *data = iter->private; - - if (data) { - free_percpu(data->cpu_data); - kfree(data); - } -} - -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) -{ - if (bit == TRACE_GRAPH_PRINT_IRQS) - ftrace_graph_skip_irqs = !set; - - return 0; -} - -static struct trace_event_functions graph_functions = { - .trace = print_graph_function_event, -}; - -static struct trace_event graph_trace_entry_event = { - .type = TRACE_GRAPH_ENT, - .funcs = &graph_functions, -}; - -static struct trace_event graph_trace_ret_event = { - .type = TRACE_GRAPH_RET, - .funcs = &graph_functions -}; - -static struct tracer graph_trace __read_mostly = { - .name = "function_graph", - .open = graph_trace_open, - .pipe_open = graph_trace_open, - .close = graph_trace_close, - .pipe_close = graph_trace_close, - .wait_pipe = poll_wait_pipe, - .init = graph_trace_init, - .reset = graph_trace_reset, - .print_line = print_graph_function, - .print_header = print_graph_headers, - .flags = &tracer_flags, - .set_flag = func_graph_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_function_graph, -#endif -}; - -static __init int init_graph_trace(void) -{ - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); - - if (!register_ftrace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); - return 1; - } - - if (!register_ftrace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); - return 1; - } - - return 
register_tracer(&graph_trace); -} - -device_initcall(init_graph_trace); -/* - * trace irqs off critical timings - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * From code in the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static struct trace_array *irqsoff_trace __read_mostly; -static int tracer_enabled __read_mostly; - -static DEFINE_PER_CPU(int, tracing_cpu); - -static DEFINE_RAW_SPINLOCK(max_trace_lock); - -enum { - TRACER_IRQS_OFF = (1 << 1), - TRACER_PREEMPT_OFF = (1 << 2), -}; - -static int trace_type __read_mostly; - -static int save_lat_flag; - -static void stop_irqsoff_tracer(struct trace_array *tr, int graph); -static int start_irqsoff_tracer(struct trace_array *tr, int graph); - -#ifdef CONFIG_PREEMPT_TRACER -static inline int -preempt_trace(void) -{ - return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); -} -#else -# define preempt_trace() (0) -#endif - -#ifdef CONFIG_IRQSOFF_TRACER -static inline int -irq_trace(void) -{ - return ((trace_type & TRACER_IRQS_OFF) && - irqs_disabled()); -} -#else -# define irq_trace() (0) -#endif - -#define TRACE_DISPLAY_GRAPH 1 - -static struct tracer_opt trace_opts[] = { -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, -#endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; - -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) - -/* - * Sequence count - we record it when starting a measurement and - * skip the latency if the sequence has changed - some other section - * did a maximum and could disturb our measurement with serial console - * printouts, etc. 
Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesn't - * decrease the validity of the maximum found: - */ -static __cacheline_aligned_in_smp unsigned long max_sequence; - -#ifdef CONFIG_FUNCTION_TRACER -/* - * Prologue for the preempt and irqs off function tracers. - * - * Returns 1 if it is OK to continue, and data->disabled is - * incremented. - * 0 if the trace is to be ignored, and data->disabled - * is kept the same. - * - * Note, this function is also used outside this ifdef but - * inside the #ifdef of the function graph tracer below. - * This is OK, since the function graph tracer is - * dependent on the function tracer. - */ -static int func_prolog_dec(struct trace_array *tr, - struct trace_array_cpu **data, - unsigned long *flags) -{ - long disabled; - int cpu; - - /* - * Does not matter if we preempt. We test the flags - * afterward, to see if irqs are disabled or not. - * If we preempt and get a false positive, the flags - * test will fail. 
- */ - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) - return 0; - - local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) - return 0; - - *data = tr->data[cpu]; - disabled = atomic_inc_return(&(*data)->disabled); - - if (likely(disabled == 1)) - return 1; - - atomic_dec(&(*data)->disabled); - - return 0; -} - -/* - * irqsoff uses its own tracer function to keep the overhead down: - */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - - if (!func_prolog_dec(tr, &data, &flags)) - return; - - trace_function(tr, ip, parent_ip, flags, preempt_count()); - - atomic_dec(&data->disabled); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) -{ - int cpu; - - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - - if (!(is_graph() ^ set)) - return 0; - - stop_irqsoff_tracer(irqsoff_trace, !set); - - for_each_possible_cpu(cpu) - per_cpu(tracing_cpu, cpu) = 0; - - tracing_max_latency = 0; - tracing_reset_online_cpus(irqsoff_trace); - - return start_irqsoff_tracer(irqsoff_trace, set); -} - -static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - int ret; - int pc; - - if (!func_prolog_dec(tr, &data, &flags)) - return 0; - - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - atomic_dec(&data->disabled); - - return ret; -} - -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - int pc; - - if 
(!func_prolog_dec(tr, &data, &flags)) - return; - - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - atomic_dec(&data->disabled); -} - -static void irqsoff_trace_open(struct trace_iterator *iter) -{ - if (is_graph()) - graph_trace_open(iter); - -} - -static void irqsoff_trace_close(struct trace_iterator *iter) -{ - if (iter->private) - graph_trace_close(iter); -} - -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ - TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ - TRACE_GRAPH_PRINT_DURATION) - -static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) -{ - /* - * In graph mode call the graph tracer output function, - * otherwise go with the TRACE_FN event handler - */ - if (is_graph()) - return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); - - return TRACE_TYPE_UNHANDLED; -} - -static void irqsoff_print_header(struct seq_file *s) -{ - if (is_graph()) - print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); - else - trace_default_header(s); -} - -static void -__trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) -{ - if (is_graph()) - trace_graph_function(tr, ip, parent_ip, flags, pc); - else - trace_function(tr, ip, parent_ip, flags, pc); -} - -#else -#define __trace_function trace_function - -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - -static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} - -static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) -{ - return TRACE_TYPE_UNHANDLED; -} - -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_trace_open(struct trace_iterator *iter) { } -static void irqsoff_trace_close(struct trace_iterator *iter) { } - -#ifdef CONFIG_FUNCTION_TRACER -static void irqsoff_print_header(struct seq_file *s) -{ - trace_default_header(s); -} -#else -static void irqsoff_print_header(struct seq_file *s) 
-{ - trace_latency_header(s); -} -#endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -/* - * Should this new latency be reported/recorded? - */ -static int report_latency(cycle_t delta) -{ - if (tracing_thresh) { - if (delta < tracing_thresh) - return 0; - } else { - if (delta <= tracing_max_latency) - return 0; - } - return 1; -} - -static void -check_critical_timing(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long parent_ip, - int cpu) -{ - cycle_t T0, T1, delta; - unsigned long flags; - int pc; - - T0 = data->preempt_timestamp; - T1 = ftrace_now(cpu); - delta = T1-T0; - - local_save_flags(flags); - - pc = preempt_count(); - - if (!report_latency(delta)) - goto out; - - raw_spin_lock_irqsave(&max_trace_lock, flags); - - /* check if we are still the max latency */ - if (!report_latency(delta)) - goto out_unlock; - - __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - /* Skip 5 functions to get to the irq/preempt enable function */ - __trace_stack(tr, flags, 5, pc); - - if (data->critical_sequence != max_sequence) - goto out_unlock; - - data->critical_end = parent_ip; - - if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; - update_max_tr_single(tr, current, cpu); - } - - max_sequence++; - -out_unlock: - raw_spin_unlock_irqrestore(&max_trace_lock, flags); - -out: - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); -} - -static inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip) -{ - int cpu; - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - - if (likely(!tracer_enabled)) - return; - - cpu = raw_smp_processor_id(); - - if (per_cpu(tracing_cpu, cpu)) - return; - - data = tr->data[cpu]; - - if (unlikely(!data) || atomic_read(&data->disabled)) - return; - - atomic_inc(&data->disabled); - - data->critical_sequence = 
max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? : ip; - - local_save_flags(flags); - - __trace_function(tr, ip, parent_ip, flags, preempt_count()); - - per_cpu(tracing_cpu, cpu) = 1; - - atomic_dec(&data->disabled); -} - -static inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip) -{ - int cpu; - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; - - cpu = raw_smp_processor_id(); - /* Always clear the tracing cpu on stopping the trace */ - if (unlikely(per_cpu(tracing_cpu, cpu))) - per_cpu(tracing_cpu, cpu) = 0; - else - return; - - if (!tracer_enabled) - return; - - data = tr->data[cpu]; - - if (unlikely(!data) || - !data->critical_start || atomic_read(&data->disabled)) - return; - - atomic_inc(&data->disabled); - - local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; - atomic_dec(&data->disabled); -} - -/* start and stop critical timings used to for stoppage (in idle) */ -void start_critical_timings(void) -{ - if (preempt_trace() || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL_GPL(start_critical_timings); - -void stop_critical_timings(void) -{ - if (preempt_trace() || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL_GPL(stop_critical_timings); - -#ifdef CONFIG_IRQSOFF_TRACER -#ifdef CONFIG_PROVE_LOCKING -void time_hardirqs_on(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(a0, a1); -} - -void time_hardirqs_off(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(a0, a1); -} - -#else /* !CONFIG_PROVE_LOCKING */ - -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void 
print_irqtrace_events(struct task_struct *curr) -{ -} - -/* - * We are only interested in hardirq on/off events: - */ -void trace_hardirqs_on(void) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -void trace_hardirqs_off(void) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -void trace_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_off_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -#endif /* CONFIG_PROVE_LOCKING */ -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - stop_critical_timing(a0, a1); -} - -void trace_preempt_off(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - start_critical_timing(a0, a1); -} -#endif /* CONFIG_PREEMPT_TRACER */ - -static int start_irqsoff_tracer(struct trace_array *tr, int graph) -{ - int ret = 0; - - if (!graph) - ret = register_ftrace_function(&trace_ops); - else - ret = register_ftrace_graph(&irqsoff_graph_return, - &irqsoff_graph_entry); - - if (!ret && tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; - - return ret; -} - -static void stop_irqsoff_tracer(struct trace_array *tr, int graph) -{ - tracer_enabled = 0; - - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); -} - -static void __irqsoff_tracer_init(struct trace_array *tr) -{ - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; - - tracing_max_latency = 0; - 
irqsoff_trace = tr; - /* make sure that the tracer is visible */ - smp_wmb(); - tracing_reset_online_cpus(tr); - - if (start_irqsoff_tracer(tr, is_graph())) - printk(KERN_ERR "failed to start irqsoff tracer\n"); -} - -static void irqsoff_tracer_reset(struct trace_array *tr) -{ - stop_irqsoff_tracer(tr, is_graph()); - - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; -} - -static void irqsoff_tracer_start(struct trace_array *tr) -{ - tracer_enabled = 1; -} - -static void irqsoff_tracer_stop(struct trace_array *tr) -{ - tracer_enabled = 0; -} - -#ifdef CONFIG_IRQSOFF_TRACER -static int irqsoff_tracer_init(struct trace_array *tr) -{ - trace_type = TRACER_IRQS_OFF; - - __irqsoff_tracer_init(tr); - return 0; -} -static struct tracer irqsoff_tracer __read_mostly = -{ - .name = "irqsoff", - .init = irqsoff_tracer_init, - .reset = irqsoff_tracer_reset, - .start = irqsoff_tracer_start, - .stop = irqsoff_tracer_stop, - .print_max = 1, - .print_header = irqsoff_print_header, - .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_irqsoff, -#endif - .open = irqsoff_trace_open, - .close = irqsoff_trace_close, - .use_max_tr = 1, -}; -# define register_irqsoff(trace) register_tracer(&trace) -#else -# define register_irqsoff(trace) do { } while (0) -#endif - -#ifdef CONFIG_PREEMPT_TRACER -static int preemptoff_tracer_init(struct trace_array *tr) -{ - trace_type = TRACER_PREEMPT_OFF; - - __irqsoff_tracer_init(tr); - return 0; -} - -static struct tracer preemptoff_tracer __read_mostly = -{ - .name = "preemptoff", - .init = preemptoff_tracer_init, - .reset = irqsoff_tracer_reset, - .start = irqsoff_tracer_start, - .stop = irqsoff_tracer_stop, - .print_max = 1, - .print_header = irqsoff_print_header, - .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = 
trace_selftest_startup_preemptoff, -#endif - .open = irqsoff_trace_open, - .close = irqsoff_trace_close, - .use_max_tr = 1, -}; -# define register_preemptoff(trace) register_tracer(&trace) -#else -# define register_preemptoff(trace) do { } while (0) -#endif - -#if defined(CONFIG_IRQSOFF_TRACER) && \ - defined(CONFIG_PREEMPT_TRACER) - -static int preemptirqsoff_tracer_init(struct trace_array *tr) -{ - trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - - __irqsoff_tracer_init(tr); - return 0; -} - -static struct tracer preemptirqsoff_tracer __read_mostly = -{ - .name = "preemptirqsoff", - .init = preemptirqsoff_tracer_init, - .reset = irqsoff_tracer_reset, - .start = irqsoff_tracer_start, - .stop = irqsoff_tracer_stop, - .print_max = 1, - .print_header = irqsoff_print_header, - .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_preemptirqsoff, -#endif - .open = irqsoff_trace_open, - .close = irqsoff_trace_close, - .use_max_tr = 1, -}; - -# define register_preemptirqsoff(trace) register_tracer(&trace) -#else -# define register_preemptirqsoff(trace) do { } while (0) -#endif - -__init static int init_irqsoff_tracer(void) -{ - register_irqsoff(irqsoff_tracer); - register_preemptoff(preemptoff_tracer); - register_preemptirqsoff(preemptirqsoff_tracer); - - return 0; -} -device_initcall(init_irqsoff_tracer); -/* - * kdb helper for dumping the ftrace buffer - * - * Copyright (C) 2010 Jason Wessel - * - * ftrace_dump_buf based on ftrace_dump: - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - */ -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -static void ftrace_dump_buf(int skip_lines, long cpu_file) -{ - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - unsigned int old_userobj; - int cnt = 0, cpu; - - trace_init_global_iter(&iter); - - 
for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); - } - - old_userobj = trace_flags; - - /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - - kdb_printf("Dumping ftrace buffer:\n"); - - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); - iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; - - if (cpu_file == TRACE_PIPE_ALL_CPU) { - for_each_tracing_cpu(cpu) { - iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.tr->buffer, cpu); - ring_buffer_read_start(iter.buffer_iter[cpu]); - tracing_iter_reset(&iter, cpu); - } - } else { - iter.cpu_file = cpu_file; - iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.tr->buffer, cpu_file); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); - tracing_iter_reset(&iter, cpu_file); - } - if (!trace_empty(&iter)) - trace_find_next_entry_inc(&iter); - while (!trace_empty(&iter)) { - if (!cnt) - kdb_printf("---------------------------------\n"); - cnt++; - - if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) - print_trace_line(&iter); - if (!skip_lines) - trace_printk_seq(&iter.seq); - else - skip_lines--; - if (KDB_FLAG(CMD_INTERRUPT)) - goto out; - } - - if (!cnt) - kdb_printf(" (ftrace buffer empty)\n"); - else - kdb_printf("---------------------------------\n"); - -out: - trace_flags = old_userobj; - - for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); - } - - for_each_tracing_cpu(cpu) - if (iter.buffer_iter[cpu]) - ring_buffer_read_finish(iter.buffer_iter[cpu]); -} - -/* - * kdb_ftdump - Dump the ftrace log buffer - */ -static int kdb_ftdump(int argc, const char **argv) -{ - int skip_lines = 0; - long cpu_file; - char *cp; - - if (argc > 2) - return KDB_ARGCOUNT; - - if (argc) { - skip_lines = simple_strtol(argv[1], &cp, 0); - if (*cp) - skip_lines = 0; - } - - if (argc == 2) { - cpu_file = simple_strtol(argv[2], &cp, 0); - 
if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || - !cpu_online(cpu_file)) - return KDB_BADINT; - } else { - cpu_file = TRACE_PIPE_ALL_CPU; - } - - kdb_trap_printk++; - ftrace_dump_buf(skip_lines, cpu_file); - kdb_trap_printk--; - - return 0; -} - -static __init int kdb_ftrace_register(void) -{ - kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_REPEAT_NONE); - return 0; -} - -late_initcall(kdb_ftrace_register); -/* - * Kprobes-based tracing events - * - * Created by Masami Hiramatsu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -#define MAX_TRACE_ARGS 128 -#define MAX_ARGSTR_LEN 63 -#define MAX_EVENT_NAME_LEN 64 -#define MAX_STRING_SIZE PATH_MAX -#define KPROBE_EVENT_SYSTEM "kprobes" - -/* Reserved field names */ -#define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_RETIP "__probe_ret_ip" -#define FIELD_STRING_FUNC "__probe_func" - -const char *reserved_field_names[] = { - "common_type", - "common_flags", - "common_preempt_count", - "common_pid", - "common_tgid", - FIELD_STRING_IP, - FIELD_STRING_RETIP, - FIELD_STRING_FUNC, -}; - -/* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq 
*, const char *, void *, - void *); -#define PRINT_TYPE_FUNC_NAME(type) print_type_##type -#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type - -/* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent)\ -{ \ - return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ -} \ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -static inline void *get_rloc_data(u32 *dl) -{ - return (u8 *)dl + get_rloc_offs(*dl); -} - -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) -{ - return (u8 *)ent + get_rloc_offs(*dl); -} - -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. 
- */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) - -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; - -/* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, - const char *name, - void *data, void *ent) -{ - int len = *(u32 *)data >> 16; - - if (!len) - return trace_seq_printf(s, " %s=(fault)", name); - else - return trace_seq_printf(s, " %s=\"%s\"", name, - (const char *)get_loc_data(data, ent)); -} -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; - -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); - -struct fetch_param { - fetch_func_t fn; - void *data; -}; - -static __kprobes void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) -{ - return fprm->fn(regs, fprm->data, dest); -} - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. 
- */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) - -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -#define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. 
Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - long ret; - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); - if (!maxlen) - return; - /* - * Try to get string again, since the string can be changed while - * probing. - */ - set_fs(KERNEL_DS); - pagefault_disable(); - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); -} -/* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - int ret, len = 0; - u8 c; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - pagefault_disable(); - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} - -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - if (sc->addr) - sc->addr += sc->offset; - return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym 
|| strlen(sym) == 0) - return NULL; - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - - update_symbol_cache(sc); - return sc; -} - -#define DEFINE_FETCH_symbol(type) \ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; -}; - -#define DEFINE_FETCH_deref(type) \ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - fetch_memory_##type(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) - -static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; - -#define 
DEFINE_FETCH_bitfield(type) \ -static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static __kprobes void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static __kprobes void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. 
- */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} - -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_END, -}; - -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ - .size = _size, \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ - .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ - } \ - } - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 - -/* Fetch type information table */ -static const struct fetch_type { - const char *name; /* Name of type */ - size_t size; /* Byte size of type */ - int is_signed; /* Signed flag */ - print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ - const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -} fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, 
"__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) -{ - int i; - - if (!type) - type = DEFAULT_FETCH_TYPE_STR; - - /* Special case: bitfield */ - if (*type == 'b') { - unsigned long bs; - type = strchr(type, '/'); - if (!type) - goto fail; - type++; - if (strict_strtoul(type, 0, &bs)) - goto fail; - switch (bs) { - case 8: - return find_fetch_type("u8"); - case 16: - return find_fetch_type("u16"); - case 32: - return find_fetch_type("u32"); - case 64: - return find_fetch_type("u64"); - default: - goto fail; - } - } - - for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) - if (strcmp(type, fetch_type_table[i].name) == 0) - return &fetch_type_table[i]; -fail: - return NULL; -} - -/* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, - void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn) -{ - int i; - - if (type != &fetch_type_table[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - return NULL; -} - -/** - * Kprobe event core functions - */ - -struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; - unsigned int offset; /* Offset from argument entry */ - const char *name; /* Name of this argument */ - const char *comm; /* 
Command of this argument */ - const struct fetch_type *type; /* Type of this argument */ -}; - -/* Flags for trace_probe */ -#define TP_FLAG_TRACE 1 -#define TP_FLAG_PROFILE 2 -#define TP_FLAG_REGISTERED 4 - -struct trace_probe { - struct list_head list; - struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ - const char *symbol; /* symbol name */ - struct ftrace_event_class class; - struct ftrace_event_call call; - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; -}; - -#define SIZEOF_TRACE_PROBE(n) \ - (offsetof(struct trace_probe, args) + \ - (sizeof(struct probe_arg) * (n))) - - -static __kprobes int trace_probe_is_return(struct trace_probe *tp) -{ - return tp->rp.handler != NULL; -} - -static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) -{ - return tp->symbol ? tp->symbol : "unknown"; -} - -static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) -{ - return tp->rp.kp.offset; -} - -static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) -{ - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); -} - -static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) -{ - return !!(tp->flags & TP_FLAG_REGISTERED); -} - -static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) -{ - return !!(kprobe_gone(&tp->rp.kp)); -} - -static __kprobes bool trace_probe_within_module(struct trace_probe *tp, - struct module *mod) -{ - int len = strlen(mod->name); - const char *name = trace_probe_symbol(tp); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; -} - -static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) -{ - return !!strchr(trace_probe_symbol(tp), ':'); -} - -static int register_probe_event(struct trace_probe *tp); -static void unregister_probe_event(struct trace_probe *tp); - -static DEFINE_MUTEX(probe_lock); -static LIST_HEAD(probe_list); - -static int 
kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); -static int kretprobe_dispatcher(struct kretprobe_instance *ri, - struct pt_regs *regs); - -/* Check the name is good for event/group/fields */ -static int is_good_name(const char *name) -{ - if (!isalpha(*name) && *name != '_') - return 0; - while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return 0; - } - return 1; -} - -/* - * Allocate new trace_probe and initialize it (including kprobes). - */ -static struct trace_probe *alloc_trace_probe(const char *group, - const char *event, - void *addr, - const char *symbol, - unsigned long offs, - int nargs, int is_return) -{ - struct trace_probe *tp; - int ret = -ENOMEM; - - tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); - if (!tp) - return ERR_PTR(ret); - - if (symbol) { - tp->symbol = kstrdup(symbol, GFP_KERNEL); - if (!tp->symbol) - goto error; - tp->rp.kp.symbol_name = tp->symbol; - tp->rp.kp.offset = offs; - } else - tp->rp.kp.addr = addr; - - if (is_return) - tp->rp.handler = kretprobe_dispatcher; - else - tp->rp.kp.pre_handler = kprobe_dispatcher; - - if (!event || !is_good_name(event)) { - ret = -EINVAL; - goto error; - } - - tp->call.class = &tp->class; - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) - goto error; - - if (!group || !is_good_name(group)) { - ret = -EINVAL; - goto error; - } - - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) - goto error; - - INIT_LIST_HEAD(&tp->list); - return tp; -error: - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); - return ERR_PTR(ret); -} - -static void update_probe_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - -static void free_probe_arg(struct 
probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); - kfree(arg->name); - kfree(arg->comm); -} - -static void free_trace_probe(struct trace_probe *tp) -{ - int i; - - for (i = 0; i < tp->nr_args; i++) - free_probe_arg(&tp->args[i]); - - kfree(tp->call.class->system); - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); -} - -static struct trace_probe *find_trace_probe(const char *event, - const char *group) -{ - struct trace_probe *tp; - - list_for_each_entry(tp, &probe_list, list) - if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.class->system, group) == 0) - return tp; - return NULL; -} - -/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static int enable_trace_probe(struct trace_probe *tp, int flag) -{ - int ret = 0; - - tp->flags |= flag; - if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && - !trace_probe_has_gone(tp)) { - if (trace_probe_is_return(tp)) - ret = enable_kretprobe(&tp->rp); - else - ret = enable_kprobe(&tp->rp.kp); - } - - return ret; -} - -/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static void disable_trace_probe(struct trace_probe *tp, int flag) -{ - tp->flags &= ~flag; - if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); - else - disable_kprobe(&tp->rp.kp); - } -} - -/* Internal register function - just handle k*probes and flags */ -static int __register_trace_probe(struct trace_probe *tp) -{ - int i, ret; - - if (trace_probe_is_registered(tp)) - return -EINVAL; - - for (i = 0; i < tp->nr_args; i++) - update_probe_arg(&tp->args[i]); - - /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(tp)) - 
tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; - else - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; - - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); - else - ret = register_kprobe(&tp->rp.kp); - - if (ret == 0) - tp->flags |= TP_FLAG_REGISTERED; - else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_probe_symbol(tp), trace_probe_offset(tp), ret); - if (ret == -ENOENT && trace_probe_is_on_module(tp)) { - pr_warning("This probe might be able to register after" - "target module is loaded. Continue.\n"); - ret = 0; - } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tp->rp.kp.addr); - ret = -EINVAL; - } - } - - return ret; -} - -/* Internal unregister function - just handle k*probes and flags */ -static void __unregister_trace_probe(struct trace_probe *tp) -{ - if (trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); - else - unregister_kprobe(&tp->rp.kp); - tp->flags &= ~TP_FLAG_REGISTERED; - /* Cleanup kprobe for reuse */ - if (tp->rp.kp.symbol_name) - tp->rp.kp.addr = NULL; - } -} - -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static int unregister_trace_probe(struct trace_probe *tp) -{ - /* Enabled event can not be unregistered */ - if (trace_probe_is_enabled(tp)) - return -EBUSY; - - __unregister_trace_probe(tp); - list_del(&tp->list); - unregister_probe_event(tp); - - return 0; -} - -/* Register a trace_probe and probe_event */ -static int register_trace_probe(struct trace_probe *tp) -{ - struct trace_probe *old_tp; - int ret; - - mutex_lock(&probe_lock); - - /* Delete old (same name) event if exist */ - old_tp = find_trace_probe(tp->call.name, tp->call.class->system); - if (old_tp) { - ret = unregister_trace_probe(old_tp); - if (ret < 0) - goto end; - free_trace_probe(old_tp); - } - - /* Register new event */ - ret = register_probe_event(tp); - if (ret) { - pr_warning("Failed to register probe 
event(%d)\n", ret); - goto end; - } - - /* Register k*probe */ - ret = __register_trace_probe(tp); - if (ret < 0) - unregister_probe_event(tp); - else - list_add_tail(&tp->list, &probe_list); - -end: - mutex_unlock(&probe_lock); - return ret; -} - -/* Module notifier call back, checking event on the module */ -static int trace_probe_module_callback(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct module *mod = data; - struct trace_probe *tp; - int ret; - - if (val != MODULE_STATE_COMING) - return NOTIFY_DONE; - - /* Update probes on coming module */ - mutex_lock(&probe_lock); - list_for_each_entry(tp, &probe_list, list) { - if (trace_probe_within_module(tp, mod)) { - /* Don't need to check busy - this should have gone. */ - __unregister_trace_probe(tp); - ret = __register_trace_probe(tp); - if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - tp->call.name, mod->name, ret); - } - } - mutex_unlock(&probe_lock); - - return NOTIFY_DONE; -} - -static struct notifier_block trace_probe_module_nb = { - .notifier_call = trace_probe_module_callback, - .priority = 1 /* Invoked after kprobe module callback */ -}; - -/* Split symbol and offset. 
*/ -static int split_symbol_offset(char *symbol, unsigned long *offset) -{ - char *tmp; - int ret; - - if (!offset) - return -EINVAL; - - tmp = strchr(symbol, '+'); - if (tmp) { - /* skip sign because strict_strtol doesn't accept '+' */ - ret = strict_strtoul(tmp + 1, 0, offset); - if (ret) - return ret; - *tmp = '\0'; - } else - *offset = 0; - return 0; -} - -#define PARAM_MAX_ARGS 16 -#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) - -static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, int is_return) -{ - int ret = 0; - unsigned long param; - - if (strcmp(arg, "retval") == 0) { - if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; - else - ret = -EINVAL; - } else if (strncmp(arg, "stack", 5) == 0) { - if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) - f->fn = fetch_stack_address; - else - ret = -EINVAL; - } else if (isdigit(arg[5])) { - ret = strict_strtoul(arg + 5, 10, ¶m); - if (ret || param > PARAM_MAX_STACK) - ret = -EINVAL; - else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; - } - } else - ret = -EINVAL; - } else - ret = -EINVAL; - return ret; -} - -/* Recursive argument parser */ -static int __parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, int is_return) -{ - int ret = 0; - unsigned long param; - long offset; - char *tmp; - - switch (arg[0]) { - case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return); - break; - case '%': /* named register */ - ret = regs_query_register_offset(arg + 1); - if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; - ret = 0; - } - break; - case '@': /* memory or symbol */ - if (isdigit(arg[1])) { - ret = strict_strtoul(arg + 1, 0, ¶m); - if (ret) - break; - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; - } else { - ret = split_symbol_offset(arg + 1, &offset); - if (ret) - break; - f->data = alloc_symbol_cache(arg + 1, offset); - if 
(f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; - } - break; - case '+': /* deref memory */ - arg++; /* Skip '+', because strict_strtol() rejects it. */ - case '-': - tmp = strchr(arg, '('); - if (!tmp) - break; - *tmp = '\0'; - ret = strict_strtol(arg, 0, &offset); - if (ret) - break; - arg = tmp + 1; - tmp = strrchr(arg, ')'); - if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2 = find_fetch_type(NULL); - *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), - GFP_KERNEL); - if (!dprm) - return -ENOMEM; - dprm->offset = offset; - ret = __parse_probe_arg(arg, t2, &dprm->orig, - is_return); - if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } - } - break; - } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", - t->name); - ret = -EINVAL; - } - return ret; -} - -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - -/* Bitfield type needs to be parsed into a fetch function */ -static int __parse_bitfield_probe_arg(const char *bf, - const struct fetch_type *t, - struct fetch_param *f) -{ - struct bitfield_fetch_param *bprm; - unsigned long bw, bo; - char *tail; - - if (*bf != 'b') - return 0; - - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; - - bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ - if (bw == 0 || *tail != '@') - return -EINVAL; - - bf = tail + 1; - bo = simple_strtoul(bf, &tail, 0); - if (tail == bf || *tail != '/') - return -EINVAL; - - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; - return (BYTES_TO_BITS(t->size) < (bw + bo)) ? 
-EINVAL : 0; -} - -/* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct trace_probe *tp, - struct probe_arg *parg, int is_return) -{ - const char *t; - int ret; - - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; - } - parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); - return -ENOMEM; - } - t = strchr(parg->comm, ':'); - if (t) { - arg[t - parg->comm] = '\0'; - t++; - } - parg->type = find_fetch_type(t); - if (!parg->type) { - pr_info("Unsupported type: %s\n", t); - return -EINVAL; - } - parg->offset = tp->size; - tp->size += parg->type->size; - ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn); - parg->fetch_size.data = parg->fetch.data; - } - return ret; -} - -/* Return 1 if name is reserved or already used by another argument */ -static int conflict_field_name(const char *name, - struct probe_arg *args, int narg) -{ - int i; - for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) - if (strcmp(reserved_field_names[i], name) == 0) - return 1; - for (i = 0; i < narg; i++) - if (strcmp(args[i].name, name) == 0) - return 1; - return 0; -} - -static int create_trace_probe(int argc, char **argv) -{ - /* - * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] - * Fetch args: - * $retval : fetch return value - * $stack : fetch stack address - * $stackN : fetch Nth of stack (N:0-) - * @ADDR : fetch memory at ADDR (ADDR should be in kernel) - * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) - * %REG : fetch register REG - * Dereferencing memory fetch: - * +|-offs(ARG) : fetch memory at ARG +|- 
offs address. - * Alias name of args: - * NAME=FETCHARG : set NAME as alias of FETCHARG. - * Type of args: - * FETCHARG:TYPE : use TYPE instead of unsigned long. - */ - struct trace_probe *tp; - int i, ret = 0; - int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *group = NULL; - char *arg; - unsigned long offset = 0; - void *addr = NULL; - char buf[MAX_EVENT_NAME_LEN]; - - /* argc must be >= 1 */ - if (argv[0][0] == 'p') - is_return = 0; - else if (argv[0][0] == 'r') - is_return = 1; - else if (argv[0][0] == '-') - is_delete = 1; - else { - pr_info("Probe definition must be started with 'p', 'r' or" - " '-'.\n"); - return -EINVAL; - } - - if (argv[0][1] == ':') { - event = &argv[0][2]; - if (strchr(event, '/')) { - group = event; - event = strchr(group, '/') + 1; - event[-1] = '\0'; - if (strlen(group) == 0) { - pr_info("Group name is not specified\n"); - return -EINVAL; - } - } - if (strlen(event) == 0) { - pr_info("Event name is not specified\n"); - return -EINVAL; - } - } - if (!group) - group = KPROBE_EVENT_SYSTEM; - - if (is_delete) { - if (!event) { - pr_info("Delete command needs an event name.\n"); - return -EINVAL; - } - mutex_lock(&probe_lock); - tp = find_trace_probe(event, group); - if (!tp) { - mutex_unlock(&probe_lock); - pr_info("Event %s/%s doesn't exist.\n", group, event); - return -ENOENT; - } - /* delete an event */ - ret = unregister_trace_probe(tp); - if (ret == 0) - free_trace_probe(tp); - mutex_unlock(&probe_lock); - return ret; - } - - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } - /* an address specified */ - ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); - if (ret) { - pr_info("Failed to parse address.\n"); - return ret; - } - } else { - /* a symbol specified */ - symbol = argv[1]; - /* TODO: support .init module functions */ - ret = 
split_symbol_offset(symbol, &offset); - if (ret) { - pr_info("Failed to parse symbol.\n"); - return ret; - } - if (offset && is_return) { - pr_info("Return probe must be used without offset.\n"); - return -EINVAL; - } - } - argc -= 2; argv += 2; - - /* setup a probe */ - if (!event) { - /* Make a new event name */ - if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", - is_return ? 'r' : 'p', symbol, offset); - else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", - is_return ? 'r' : 'p', addr); - event = buf; - } - tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, - is_return); - if (IS_ERR(tp)) { - pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tp)); - return PTR_ERR(tp); - } - - /* parse arguments */ - ret = 0; - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - /* Increment count for freeing args in error case */ - tp->nr_args++; - - /* Parse argument name */ - arg = strchr(argv[i], '='); - if (arg) { - *arg++ = '\0'; - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); - } else { - arg = argv[i]; - /* If argument name is omitted, set "argN" */ - snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tp->args[i].name = kstrdup(buf, GFP_KERNEL); - } - - if (!tp->args[i].name) { - pr_info("Failed to allocate argument[%d] name.\n", i); - ret = -ENOMEM; - goto error; - } - - if (!is_good_name(tp->args[i].name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, tp->args[i].name); - ret = -EINVAL; - goto error; - } - - if (conflict_field_name(tp->args[i].name, tp->args, i)) { - pr_info("Argument[%d] name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - - /* Parse fetch argument */ - ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); - if (ret) { - pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); - goto error; - } - } - - ret = register_trace_probe(tp); - if (ret) - goto error; - return 0; - -error: - free_trace_probe(tp); - return ret; -} - -static int release_all_trace_probes(void) -{ - struct trace_probe *tp; - int ret = 0; - - mutex_lock(&probe_lock); - /* Ensure no probe is in use. */ - list_for_each_entry(tp, &probe_list, list) - if (trace_probe_is_enabled(tp)) { - ret = -EBUSY; - goto end; - } - /* TODO: Use batch unregistration */ - while (!list_empty(&probe_list)) { - tp = list_entry(probe_list.next, struct trace_probe, list); - unregister_trace_probe(tp); - free_trace_probe(tp); - } - -end: - mutex_unlock(&probe_lock); - - return ret; -} - -/* Probes listing interfaces */ -static void *probes_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&probe_lock); - return seq_list_start(&probe_list, *pos); -} - -static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &probe_list, pos); -} - -static void probes_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&probe_lock); -} - -static int probes_seq_show(struct seq_file *m, void *v) -{ - struct trace_probe *tp = v; - int i; - - seq_printf(m, "%c", trace_probe_is_return(tp) ? 
'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); - - if (!tp->symbol) - seq_printf(m, " 0x%p", tp->rp.kp.addr); - else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", trace_probe_symbol(tp), - tp->rp.kp.offset); - else - seq_printf(m, " %s", trace_probe_symbol(tp)); - - for (i = 0; i < tp->nr_args; i++) - seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); - seq_printf(m, "\n"); - - return 0; -} - -static const struct seq_operations probes_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, - .show = probes_seq_show -}; - -static int probes_open(struct inode *inode, struct file *file) -{ - int ret; - - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_probes(); - if (ret < 0) - return ret; - } - - return seq_open(file, &probes_seq_op); -} - -static int command_trace_probe(const char *buf) -{ - char **argv; - int argc = 0, ret = 0; - - argv = argv_split(GFP_KERNEL, buf, &argc); - if (!argv) - return -ENOMEM; - - if (argc) - ret = create_trace_probe(argc, argv); - - argv_free(argv); - return ret; -} - -#define WRITE_BUFSIZE 4096 - -static ssize_t probes_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *kbuf, *tmp; - int ret; - size_t done; - size_t size; - - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - ret = done = 0; - while (done < count) { - size = count - done; - if (size >= WRITE_BUFSIZE) - size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } - kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); - if (tmp) - *tmp = '\0'; - - ret = 
command_trace_probe(kbuf); - if (ret) - goto out; - } - ret = done; -out: - kfree(kbuf); - return ret; -} - -static const struct file_operations kprobe_events_ops = { - .owner = THIS_MODULE, - .open = probes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, - .write = probes_write, -}; - -/* Probes profiling interfaces */ -static int probes_profile_seq_show(struct seq_file *m, void *v) -{ - struct trace_probe *tp = v; - - seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, - tp->rp.kp.nmissed); - - return 0; -} - -static const struct seq_operations profile_seq_op = { - .start = probes_seq_start, - .next = probes_seq_next, - .stop = probes_seq_stop, - .show = probes_profile_seq_show -}; - -static int profile_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &profile_seq_op); -} - -static const struct file_operations kprobe_profile_ops = { - .owner = THIS_MODULE, - .open = profile_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -/* Sum up total data length for dynamic arraies (strings) */ -static __kprobes int __get_data_size(struct trace_probe *tp, - struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, - struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - 
/* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - -/* Kprobe handler */ -static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) -{ - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry_head *entry; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - int size, dsize, pc; - unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; - - tp->nhit++; - - local_save_flags(irq_flags); - pc = preempt_count(); - - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; - - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)kp->addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); -} - -/* Kretprobe handler */ -static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry_head *entry; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - int size, pc, dsize; - unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; - - local_save_flags(irq_flags); - pc = preempt_count(); - - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; - - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - 
entry->func = (unsigned long)tp->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); -} - -/* Event entry printers */ -enum print_line_t -print_kprobe_event(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct kprobe_trace_entry_head *field; - struct trace_seq *s = &iter->seq; - struct trace_probe *tp; - u8 *data; - int i; - - field = (struct kprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); - - if (!trace_seq_printf(s, "%s: (", tp->call.name)) - goto partial; - - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; - - if (!trace_seq_puts(s, ")")) - goto partial; - - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto partial; - - if (!trace_seq_puts(s, "\n")) - goto partial; - - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -enum print_line_t -print_kretprobe_event(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct kretprobe_trace_entry_head *field; - struct trace_seq *s = &iter->seq; - struct trace_probe *tp; - u8 *data; - int i; - - field = (struct kretprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); - - if (!trace_seq_printf(s, "%s: (", tp->call.name)) - goto partial; - - if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; - - if (!trace_seq_puts(s, " <- ")) - goto partial; - - if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) - goto partial; - - if (!trace_seq_puts(s, ")")) - goto partial; - - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if 
(!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto partial; - - if (!trace_seq_puts(s, "\n")) - goto partial; - - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -#undef DEFINE_FIELD -#define DEFINE_FIELD(type, item, name, is_signed) \ - do { \ - ret = trace_define_field(event_call, #type, name, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ - FILTER_OTHER); \ - if (ret) \ - return ret; \ - } while (0) - -static int kprobe_event_define_fields(struct ftrace_event_call *event_call) -{ - int ret, i; - struct kprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; - - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; -} - -static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) -{ - int ret, i; - struct kretprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; - - DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); - DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; -} - -static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) -{ - int i; - int pos = 0; - - const char *fmt, *arg; - - if (!trace_probe_is_return(tp)) { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } else { - fmt = "(%lx <- %lx)"; - arg = "REC->" 
FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_print_fmt(struct trace_probe *tp) -{ - int len; - char *print_fmt; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1); - tp->call.print_fmt = print_fmt; - - return 0; -} - -#ifdef CONFIG_PERF_EVENTS - -/* Kprobe profile handler */ -static __kprobes void kprobe_perf_func(struct kprobe *kp, - struct pt_regs *regs) -{ - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry_head *entry; - struct hlist_head *head; - int size, __size, dsize; - int rctx; - - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; - size = ALIGN(__size + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; - - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); - if (!entry) - return; - - entry->ip = (unsigned long)kp->addr; - memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), tp, 
regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); -} - -/* Kretprobe profile handler */ -static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry_head *entry; - struct hlist_head *head; - int size, __size, dsize; - int rctx; - - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; - size = ALIGN(__size + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; - - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); - if (!entry) - return; - - entry->func = (unsigned long)tp->rp.kp.addr; - entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); -} -#endif /* CONFIG_PERF_EVENTS */ - -static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) -{ - struct trace_probe *tp = (struct trace_probe *)event->data; - - switch (type) { - case TRACE_REG_REGISTER: - return enable_trace_probe(tp, TP_FLAG_TRACE); - case TRACE_REG_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_TRACE); - return 0; - -#ifdef CONFIG_PERF_EVENTS - case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, TP_FLAG_PROFILE); - case TRACE_REG_PERF_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_PROFILE); - return 0; -#endif - } - return 0; -} - -static __kprobes -int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) -{ - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - - if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(kp, regs); -#ifdef CONFIG_PERF_EVENTS 
- if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(kp, regs); -#endif - return 0; /* We don't tweek kernel, so just return 0 */ -} - -static __kprobes -int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - - if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(ri, regs); -#ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(ri, regs); -#endif - return 0; /* We don't tweek kernel, so just return 0 */ -} - -static struct trace_event_functions kretprobe_funcs = { - .trace = print_kretprobe_event -}; - -static struct trace_event_functions kprobe_funcs = { - .trace = print_kprobe_event -}; - -static int register_probe_event(struct trace_probe *tp) -{ - struct ftrace_event_call *call = &tp->call; - int ret; - - /* Initialize ftrace_event_call */ - INIT_LIST_HEAD(&call->class->fields); - if (trace_probe_is_return(tp)) { - call->event.funcs = &kretprobe_funcs; - call->class->define_fields = kretprobe_event_define_fields; - } else { - call->event.funcs = &kprobe_funcs; - call->class->define_fields = kprobe_event_define_fields; - } - if (set_print_fmt(tp) < 0) - return -ENOMEM; - ret = register_ftrace_event(&call->event); - if (!ret) { - kfree(call->print_fmt); - return -ENODEV; - } - call->flags = 0; - call->class->reg = kprobe_register; - call->data = tp; - ret = trace_add_event_call(call); - if (ret) { - pr_info("Failed to register kprobe event: %s\n", call->name); - kfree(call->print_fmt); - unregister_ftrace_event(&call->event); - } - return ret; -} - -static void unregister_probe_event(struct trace_probe *tp) -{ - /* tp->event is unregistered in trace_remove_event_call() */ - trace_remove_event_call(&tp->call); - kfree(tp->call.print_fmt); -} - -/* Make a debugfs interface for controlling probe points */ -static __init int init_kprobe_trace(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - if 
(register_module_notifier(&trace_probe_module_nb)) - return -EINVAL; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return 0; - - entry = debugfs_create_file("kprobe_events", 0644, d_tracer, - NULL, &kprobe_events_ops); - - /* Event list interface */ - if (!entry) - pr_warning("Could not create debugfs " - "'kprobe_events' entry\n"); - - /* Profile interface */ - entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, - NULL, &kprobe_profile_ops); - - if (!entry) - pr_warning("Could not create debugfs " - "'kprobe_profile' entry\n"); - return 0; -} -fs_initcall(init_kprobe_trace); - - -#ifdef CONFIG_FTRACE_STARTUP_TEST - -/* - * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. - */ -static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) -{ - return a1 + a2 + a3 + a4 + a5 + a6; -} - -static __init int kprobe_trace_self_tests_init(void) -{ - int ret, warn = 0; - int (*target)(int, int, int, int, int, int); - struct trace_probe *tp; - - target = kprobe_trace_selftest_target; - - pr_info("Testing kprobe tracing: "); - - ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)"); - if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function entry.\n"); - warn++; - } else { - /* Enable trace point */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); - warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); - } - - ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " - "$retval"); - if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function return.\n"); - warn++; - } else { - /* Enable trace point */ - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); - warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); - } - - if (warn) - 
goto end; - - ret = target(1, 2, 3, 4, 5, 6); - - /* Disable trace points before removing it */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting test probe.\n"); - warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); - - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting 2nd test probe.\n"); - warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); - - ret = command_trace_probe("-:testprobe"); - if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); - warn++; - } - - ret = command_trace_probe("-:testprobe2"); - if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); - warn++; - } - -end: - release_all_trace_probes(); - if (warn) - pr_cont("NG: Some tests are failed. Please check them.\n"); - else - pr_cont("OK\n"); - return 0; -} - -late_initcall(kprobe_trace_self_tests_init); - -#endif -/* - * Memory mapped I/O tracing - * - * Copyright (C) 2008 Pekka Paalanen - */ - -#define DEBUG 1 - -#include -#include -#include -#include -#include - -#include - -#include "trace.h" -#include "trace_output.h" - -struct header_iter { - struct pci_dev *dev; -}; - -static struct trace_array *mmio_trace_array; -static bool overrun_detected; -static unsigned long prev_overruns; -static atomic_t dropped_count; - -static void mmio_reset_data(struct trace_array *tr) -{ - overrun_detected = false; - prev_overruns = 0; - - tracing_reset_online_cpus(tr); -} - -static int mmio_trace_init(struct trace_array *tr) -{ - pr_debug("in %s\n", __func__); - mmio_trace_array = tr; - - mmio_reset_data(tr); - enable_mmiotrace(); - return 0; -} - -static void mmio_trace_reset(struct trace_array *tr) -{ - pr_debug("in %s\n", __func__); - - disable_mmiotrace(); - mmio_reset_data(tr); - mmio_trace_array = NULL; -} - -static void mmio_trace_start(struct trace_array *tr) -{ - pr_debug("in %s\n", __func__); - 
mmio_reset_data(tr); -} - -static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) -{ - int ret = 0; - int i; - resource_size_t start, end; - const struct pci_driver *drv = pci_dev_driver(dev); - - /* XXX: incomplete checks for trace_seq_printf() return value */ - ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", - dev->bus->number, dev->devfn, - dev->vendor, dev->device, dev->irq); - /* - * XXX: is pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ - for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", - (unsigned long long)(start | - (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); - } - for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", - dev->resource[i].start < dev->resource[i].end ? - (unsigned long long)(end - start) + 1 : 0); - } - if (drv) - ret += trace_seq_printf(s, " %s\n", drv->name); - else - ret += trace_seq_printf(s, " \n"); - return ret; -} - -static void destroy_header_iter(struct header_iter *hiter) -{ - if (!hiter) - return; - pci_dev_put(hiter->dev); - kfree(hiter); -} - -static void mmio_pipe_open(struct trace_iterator *iter) -{ - struct header_iter *hiter; - struct trace_seq *s = &iter->seq; - - trace_seq_printf(s, "VERSION 20070824\n"); - - hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); - if (!hiter) - return; - - hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); - iter->private = hiter; -} - -/* XXX: This is not called when the pipe is closed! 
*/ -static void mmio_close(struct trace_iterator *iter) -{ - struct header_iter *hiter = iter->private; - destroy_header_iter(hiter); - iter->private = NULL; -} - -static unsigned long count_overruns(struct trace_iterator *iter) -{ - unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->tr->buffer); - - if (over > prev_overruns) - cnt += over - prev_overruns; - prev_overruns = over; - return cnt; -} - -static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, - char __user *ubuf, size_t cnt, loff_t *ppos) -{ - ssize_t ret; - struct header_iter *hiter = iter->private; - struct trace_seq *s = &iter->seq; - unsigned long n; - - n = count_overruns(iter); - if (n) { - /* XXX: This is later than where events were lost. */ - trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); - if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); - overrun_detected = true; - goto print_out; - } - - if (!hiter) - return 0; - - mmio_print_pcidev(s, hiter->dev); - hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); - - if (!hiter->dev) { - destroy_header_iter(hiter); - iter->private = NULL; - } - -print_out: - ret = trace_seq_to_user(s, ubuf, cnt); - return (ret == -EBUSY) ? 
0 : ret; -} - -static enum print_line_t mmio_print_rw(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_mmiotrace_rw *field; - struct mmiotrace_rw *rw; - struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned secs = (unsigned long)t; - int ret = 1; - - trace_assign_type(field, entry); - rw = &field->rw; - - switch (rw->opcode) { - case MMIO_READ: - ret = trace_seq_printf(s, - "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, - (unsigned long long)rw->phys, - rw->value, rw->pc, 0); - break; - case MMIO_WRITE: - ret = trace_seq_printf(s, - "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, - (unsigned long long)rw->phys, - rw->value, rw->pc, 0); - break; - case MMIO_UNKNOWN_OP: - ret = trace_seq_printf(s, - "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," - "%02lx 0x%lx %d\n", - secs, usec_rem, rw->map_id, - (unsigned long long)rw->phys, - (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, - (rw->value >> 0) & 0xff, rw->pc, 0); - break; - default: - ret = trace_seq_printf(s, "rw what?\n"); - break; - } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t mmio_print_map(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_mmiotrace_map *field; - struct mmiotrace_map *m; - struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned secs = (unsigned long)t; - int ret; - - trace_assign_type(field, entry); - m = &field->map; - - switch (m->opcode) { - case MMIO_PROBE: - ret = trace_seq_printf(s, - "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, - (unsigned long long)m->phys, m->virt, m->len, - 0UL, 0); - break; - case MMIO_UNPROBE: - ret = trace_seq_printf(s, - "UNMAP %u.%06lu %d 0x%lx %d\n", 
- secs, usec_rem, m->map_id, 0UL, 0); - break; - default: - ret = trace_seq_printf(s, "map what?\n"); - break; - } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t mmio_print_mark(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct print_entry *print = (struct print_entry *)entry; - const char *msg = print->buf; - struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned secs = (unsigned long)t; - int ret; - - /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t mmio_print_line(struct trace_iterator *iter) -{ - switch (iter->ent->type) { - case TRACE_MMIO_RW: - return mmio_print_rw(iter); - case TRACE_MMIO_MAP: - return mmio_print_map(iter); - case TRACE_PRINT: - return mmio_print_mark(iter); - default: - return TRACE_TYPE_HANDLED; /* ignore unknown entries */ - } -} - -static struct tracer mmio_tracer __read_mostly = -{ - .name = "mmiotrace", - .init = mmio_trace_init, - .reset = mmio_trace_reset, - .start = mmio_trace_start, - .pipe_open = mmio_pipe_open, - .close = mmio_close, - .read = mmio_read, - .print_line = mmio_print_line, -}; - -__init static int init_mmio_trace(void) -{ - return register_tracer(&mmio_tracer); -} -device_initcall(init_mmio_trace); - -static void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_rw *rw) -{ - struct ftrace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->buffer; - struct ring_buffer_event *event; - struct trace_mmiotrace_rw *entry; - int pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, - sizeof(*entry), 0, pc); - if (!event) { - atomic_inc(&dropped_count); - return; - } - entry = 
ring_buffer_event_data(event); - entry->rw = *rw; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void mmio_trace_rw(struct mmiotrace_rw *rw) -{ - struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = tr->data[smp_processor_id()]; - __trace_mmiotrace_rw(tr, data, rw); -} - -static void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_map *map) -{ - struct ftrace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->buffer; - struct ring_buffer_event *event; - struct trace_mmiotrace_map *entry; - int pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, - sizeof(*entry), 0, pc); - if (!event) { - atomic_inc(&dropped_count); - return; - } - entry = ring_buffer_event_data(event); - entry->map = *map; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void mmio_trace_mapping(struct mmiotrace_map *map) -{ - struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = tr->data[smp_processor_id()]; - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); -} - -int mmio_trace_printk(const char *fmt, va_list args) -{ - return trace_vprintk(0, fmt, args); -} -/* - * nop tracer - * - * Copyright (C) 2008 Steven Noonan - * - */ - -#include -#include -#include -#include - -#include "trace.h" - -/* Our two options */ -enum { - TRACE_NOP_OPT_ACCEPT = 0x1, - TRACE_NOP_OPT_REFUSE = 0x2 -}; - -/* Options for the tracer (see trace_options file) */ -static struct tracer_opt nop_opts[] = { - /* Option that will be accepted by set_flag callback */ - { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, - /* Option that will be refused by set_flag callback */ - { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, - { } /* Always set a last empty entry */ -}; - -static struct 
tracer_flags nop_flags = { - /* You can check your flags value here when you want. */ - .val = 0, /* By default: all flags disabled */ - .opts = nop_opts -}; - -static struct trace_array *ctx_trace; - -static void start_nop_trace(struct trace_array *tr) -{ - /* Nothing to do! */ -} - -static void stop_nop_trace(struct trace_array *tr) -{ - /* Nothing to do! */ -} - -static int nop_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - start_nop_trace(tr); - return 0; -} - -static void nop_trace_reset(struct trace_array *tr) -{ - stop_nop_trace(tr); -} - -/* It only serves as a signal handler and a callback to - * accept or refuse tthe setting of a flag. - * If you don't implement it, then the flag setting will be - * automatically accepted. - */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) -{ - /* - * Note that you don't need to update nop_flags.val yourself. - * The tracing Api will do it automatically if you return 0 - */ - if (bit == TRACE_NOP_OPT_ACCEPT) { - printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." - " Now cat trace_options to see the result\n", - set); - return 0; - } - - if (bit == TRACE_NOP_OPT_REFUSE) { - printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." 
- "Now cat trace_options to see the result\n", - set); - return -EINVAL; - } - - return 0; -} - - -struct tracer nop_trace __read_mostly = -{ - .name = "nop", - .init = nop_trace_init, - .reset = nop_trace_reset, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_nop, -#endif - .flags = &nop_flags, - .set_flag = nop_set_flag -}; - -/* - * trace_output.c - * - * Copyright (C) 2008 Red Hat Inc, Steven Rostedt - * - */ - -#include -#include -#include - -#include "trace_output.h" - -/* must be a power of 2 */ -#define EVENT_HASHSIZE 128 - -DECLARE_RWSEM(trace_event_mutex); - -static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; - -static int next_event_type = __TRACE_LAST_TYPE + 1; - -int trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - int ret; - - ret = seq_write(m, s->buffer, len); - - /* - * Only reset this buffer if we successfully wrote to the - * seq_file buffer. 
- */ - if (!ret) - trace_seq_init(s); - - return ret; -} - -enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct bprint_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct print_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%s", field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (s->full || !len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_printf); - -/** - * trace_seq_vprintf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. 
To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = vsnprintf(s->buffer + s->len, len, fmt, args); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} -EXPORT_SYMBOL_GPL(trace_seq_vprintf); - -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) -{ - int len = (PAGE_SIZE - 1) - s->len; - int ret; - - if (s->full || !len) - return 0; - - ret = bstr_printf(s->buffer + s->len, len, fmt, binary); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { - s->full = 1; - return 0; - } - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. 
- */ -int trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -int trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - s->buffer[s->len++] = c; - - return 1; -} -EXPORT_SYMBOL(trace_seq_putc); - -int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) -{ - if (s->full) - return 0; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return 0; - } - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - const unsigned char *data = mem; - int i, j; - - if (s->full) - return 0; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -void *trace_seq_reserve(struct trace_seq *s, size_t len) -{ - void *ret; - - if (s->full) - return NULL; - - if (len > ((PAGE_SIZE - 1) - s->len)) { - s->full = 1; - return NULL; - } - - ret = s->buffer + s->len; - s->len += len; - - return ret; -} - -int trace_seq_path(struct trace_seq *s, struct path *path) -{ - unsigned char *p; - - if (s->full) - return 0; - - if (s->len >= (PAGE_SIZE - 1)) { - s->full = 1; - return 0; - } - - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - s->full = 1; - return 0; -} - -const char * -ftrace_print_flags_seq(struct trace_seq *p, const char *delim, - unsigned 
long flags, - const struct trace_print_flags *flag_array) -{ - unsigned long mask; - const char *str; - const char *ret = p->buffer + p->len; - int i; - - for (i = 0; flag_array[i].name && flags; i++) { - - mask = flag_array[i].mask; - if ((flags & mask) != mask) - continue; - - str = flag_array[i].name; - flags &= ~mask; - if (p->len && delim) - trace_seq_puts(p, delim); - trace_seq_puts(p, str); - } - - /* check for left over flags */ - if (flags) { - if (p->len && delim) - trace_seq_puts(p, delim); - trace_seq_printf(p, "0x%lx", flags); - } - - trace_seq_putc(p, 0); - - return ret; -} -EXPORT_SYMBOL(ftrace_print_flags_seq); - -const char * -ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) -{ - int i; - const char *ret = p->buffer + p->len; - - for (i = 0; symbol_array[i].name; i++) { - - if (val != symbol_array[i].mask) - continue; - - trace_seq_puts(p, symbol_array[i].name); - break; - } - - if (!p->len) - trace_seq_printf(p, "0x%lx", val); - - trace_seq_putc(p, 0); - - return ret; -} -EXPORT_SYMBOL(ftrace_print_symbols_seq); - -#if BITS_PER_LONG == 32 -const char * -ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) -{ - int i; - const char *ret = p->buffer + p->len; - - for (i = 0; symbol_array[i].name; i++) { - - if (val != symbol_array[i].mask) - continue; - - trace_seq_puts(p, symbol_array[i].name); - break; - } - - if (!p->len) - trace_seq_printf(p, "0x%llx", val); - - trace_seq_putc(p, 0); - - return ret; -} -EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); -#endif - -const char * -ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) -{ - int i; - const char *ret = p->buffer + p->len; - - for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? 
"" : " ", buf[i]); - - trace_seq_putc(p, 0); - - return ret; -} -EXPORT_SYMBOL(ftrace_print_hex_seq); - -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) -{ - static const char tramp_name[] = "kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) - return "[unknown/kretprobe'd]"; - return name; -} -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ - -static int -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -static int -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - sprint_symbol(str, address); - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) -{ - struct file *file = NULL; - unsigned long vmstart = 0; - int ret = 1; - - if (s->full) - return 0; - - if (mm) { - const struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ip); - if (vma) { - file = vma->vm_file; - vmstart = vma->vm_start; - } - if (file) { - ret = trace_seq_path(s, &file->f_path); - if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", - ip - vmstart); - } - up_read(&mm->mmap_sem); - } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - 
struct mm_struct *mm = NULL; - int ret = 1; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! - */ - rcu_read_lock(); - task = find_task_by_vpid(entry->tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || !ret) - break; - if (ret) - ret = trace_seq_puts(s, " => "); - if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - if (ret) - ret = trace_seq_puts(s, "\n"); - continue; - } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_puts(s, "\n"); - } - - if (mm) - mmput(mm); - return ret; -} - -int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) -{ - int ret; - - if (!ip) - return trace_seq_printf(s, "0"); - - if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); - else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; - - if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -/** - * trace_print_lat_fmt - print the irq, preempt and lockdep fields - * @s: trace seq struct to write to - * @entry: The trace entry field from the ring buffer - * - * Prints the generic fields of irqs off, in hard or softirq, preempt - * count. - */ -int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) -{ - char hardsoft_irq; - char need_resched; - char irqs_off; - int hardirq; - int softirq; - int ret; - - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - - irqs_off = - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : - '.'; - need_resched = - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.'; - hardsoft_irq = - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : - softirq ? 's' : - '.'; - - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) - return 0; - - if (entry->preempt_count) - ret = trace_seq_printf(s, "%x", entry->preempt_count); - else - ret = trace_seq_putc(s, '.'); - - return ret; -} - -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) -{ - char comm[TASK_COMM_LEN]; - - trace_find_cmdline(entry->pid, comm); - - if (!trace_seq_printf(s, "%8.8s-%-5d %3d", - comm, entry->pid, cpu)) - return 0; - - return trace_print_lat_fmt(s, entry); -} - -static unsigned long preempt_mark_thresh = 100; - -static int -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) -{ - return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, - rel_usecs > preempt_mark_thresh ? '!' : - rel_usecs > 1 ? '+' : ' '); -} - -int trace_print_context(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned long secs = (unsigned long)t; - char comm[TASK_COMM_LEN]; - int ret; - - trace_find_cmdline(entry->pid, comm); - - ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); - if (!ret) - return 0; - - if (trace_flags & TRACE_ITER_IRQ_INFO) { - ret = trace_print_lat_fmt(s, entry); - if (!ret) - return 0; - } - - return trace_seq_printf(s, " %5lu.%06lu: ", - secs, usec_rem); -} - -int trace_print_lat_context(struct trace_iterator *iter) -{ - u64 next_ts; - int ret; - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent, - *next_entry = trace_find_next_entry(iter, NULL, - &next_ts); - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); - unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - unsigned long rel_usecs; - - if (!next_entry) - next_ts = iter->ts; - 
rel_usecs = ns2usecs(next_ts - iter->ts); - - if (verbose) { - char comm[TASK_COMM_LEN]; - - trace_find_cmdline(entry->pid, comm); - - ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" - " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx, - ns2usecs(iter->ts), - abs_usecs / USEC_PER_MSEC, - abs_usecs % USEC_PER_MSEC, - rel_usecs / USEC_PER_MSEC, - rel_usecs % USEC_PER_MSEC); - } else { - ret = lat_print_generic(s, entry, iter->cpu); - if (ret) - ret = lat_print_timestamp(s, abs_usecs, rel_usecs); - } - - return ret; -} - -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - -/** - * ftrace_find_event - find a registered event - * @type: the type of event to look for - * - * Returns an event of type @type otherwise NULL - * Called with trace_event_read_lock() held. - */ -struct trace_event *ftrace_find_event(int type) -{ - struct trace_event *event; - struct hlist_node *n; - unsigned key; - - key = type & (EVENT_HASHSIZE - 1); - - hlist_for_each_entry(event, n, &event_hash[key], node) { - if (event->type == type) - return event; - } - - return NULL; -} - -static LIST_HEAD(ftrace_event_list); - -static int trace_search_list(struct list_head **list) -{ - struct trace_event *e; - int last = __TRACE_LAST_TYPE; - - if (list_empty(&ftrace_event_list)) { - *list = &ftrace_event_list; - return last + 1; - } - - /* - * We used up all possible max events, - * lets see if somebody freed one. - */ - list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != last + 1) - break; - last++; - } - - /* Did we used up all 65 thousand events??? 
*/ - if ((last + 1) > FTRACE_MAX_EVENT) - return 0; - - *list = &e->list; - return last + 1; -} - -void trace_event_read_lock(void) -{ - down_read(&trace_event_mutex); -} - -void trace_event_read_unlock(void) -{ - up_read(&trace_event_mutex); -} - -/** - * register_ftrace_event - register output for an event type - * @event: the event type to register - * - * Event types are stored in a hash and this hash is used to - * find a way to print an event. If the @event->type is set - * then it will use that type, otherwise it will assign a - * type to use. - * - * If you assign your own type, please make sure it is added - * to the trace_type enum in trace.h, to avoid collisions - * with the dynamic types. - * - * Returns the event type number or zero on error. - */ -int register_ftrace_event(struct trace_event *event) -{ - unsigned key; - int ret = 0; - - down_write(&trace_event_mutex); - - if (WARN_ON(!event)) - goto out; - - if (WARN_ON(!event->funcs)) - goto out; - - INIT_LIST_HEAD(&event->list); - - if (!event->type) { - struct list_head *list = NULL; - - if (next_event_type > FTRACE_MAX_EVENT) { - - event->type = trace_search_list(&list); - if (!event->type) - goto out; - - } else { - - event->type = next_event_type++; - list = &ftrace_event_list; - } - - if (WARN_ON(ftrace_find_event(event->type))) - goto out; - - list_add_tail(&event->list, list); - - } else if (event->type > __TRACE_LAST_TYPE) { - printk(KERN_WARNING "Need to add type to trace.h\n"); - WARN_ON(1); - goto out; - } else { - /* Is this event already used */ - if (ftrace_find_event(event->type)) - goto out; - } - - if (event->funcs->trace == NULL) - event->funcs->trace = trace_nop_print; - if (event->funcs->raw == NULL) - event->funcs->raw = trace_nop_print; - if (event->funcs->hex == NULL) - event->funcs->hex = trace_nop_print; - if (event->funcs->binary == NULL) - event->funcs->binary = trace_nop_print; - - key = event->type & (EVENT_HASHSIZE - 1); - - hlist_add_head(&event->node, 
&event_hash[key]); - - ret = event->type; - out: - up_write(&trace_event_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(register_ftrace_event); - -/* - * Used by module code with the trace_event_mutex held for write. - */ -int __unregister_ftrace_event(struct trace_event *event) -{ - hlist_del(&event->node); - list_del(&event->list); - return 0; -} - -/** - * unregister_ftrace_event - remove a no longer used event - * @event: the event to remove - */ -int unregister_ftrace_event(struct trace_event *event) -{ - down_write(&trace_event_mutex); - __unregister_ftrace_event(event); - up_write(&trace_event_mutex); - - return 0; -} -EXPORT_SYMBOL_GPL(unregister_ftrace_event); - -/* - * Standard events - */ - -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -/* TRACE_FN */ -static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct ftrace_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_printf(s, " <-")) - goto partial; - if (!seq_print_ip_sym(s, - field->parent_ip, - flags)) - goto partial; - } - if (!trace_seq_printf(s, "\n")) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct ftrace_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "%lx %lx\n", - field->ip, - field->parent_ip)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, - 
struct trace_event *event) -{ - struct ftrace_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct ftrace_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_fn_funcs = { - .trace = trace_fn_trace, - .raw = trace_fn_raw, - .hex = trace_fn_hex, - .binary = trace_fn_bin, -}; - -static struct trace_event trace_fn_event = { - .type = TRACE_FN, - .funcs = &trace_fn_funcs, -}; - -/* TRACE_CTX an TRACE_WAKE */ -static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, - char *delim) -{ - struct ctx_switch_entry *field; - char comm[TASK_COMM_LEN]; - int S, T; - - - trace_assign_type(field, iter->ent); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - trace_find_cmdline(field->next_pid, comm); - if (!trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, delim, - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return trace_ctxwake_print(iter, "==>"); -} - -static enum print_line_t trace_wake_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - return trace_ctxwake_print(iter, " +"); -} - -static int trace_ctxwake_raw(struct trace_iterator *iter, char S) -{ - struct ctx_switch_entry *field; - int T; - - trace_assign_type(field, iter->ent); - - if (!S) - S = 
task_state_char(field->prev_state); - T = task_state_char(field->next_state); - if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return trace_ctxwake_raw(iter, 0); -} - -static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return trace_ctxwake_raw(iter, '+'); -} - - -static int trace_ctxwake_hex(struct trace_iterator *iter, char S) -{ - struct ctx_switch_entry *field; - struct trace_seq *s = &iter->seq; - int T; - - trace_assign_type(field, iter->ent); - - if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); - - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return trace_ctxwake_hex(iter, 0); -} - -static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return trace_ctxwake_hex(iter, '+'); -} - -static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct ctx_switch_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - 
SEQ_PUT_FIELD_RET(s, field->next_state); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_ctx_funcs = { - .trace = trace_ctx_print, - .raw = trace_ctx_raw, - .hex = trace_ctx_hex, - .binary = trace_ctxwake_bin, -}; - -static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, - .funcs = &trace_ctx_funcs, -}; - -static struct trace_event_functions trace_wake_funcs = { - .trace = trace_wake_print, - .raw = trace_wake_raw, - .hex = trace_wake_hex, - .binary = trace_ctxwake_bin, -}; - -static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, - .funcs = &trace_wake_funcs, -}; - -/* TRACE_STACK */ - -static enum print_line_t trace_stack_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct stack_entry *field; - struct trace_seq *s = &iter->seq; - unsigned long *p; - unsigned long *end; - - trace_assign_type(field, iter->ent); - end = (unsigned long *)((long)iter->ent + iter->ent_size); - - if (!trace_seq_puts(s, "\n")) - goto partial; - - for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { - if (!trace_seq_puts(s, " => ")) - goto partial; - - if (!seq_print_ip_sym(s, *p, flags)) - goto partial; - if (!trace_seq_puts(s, "\n")) - goto partial; - } - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event_functions trace_stack_funcs = { - .trace = trace_stack_print, -}; - -static struct trace_event trace_stack_event = { - .type = TRACE_STACK, - .funcs = &trace_stack_funcs, -}; - -/* TRACE_USER_STACK */ -static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct userstack_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_puts(s, "\n")) - goto partial; - - if (!seq_print_userip_objs(field, s, flags)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - 
-static struct trace_event_functions trace_user_stack_funcs = { - .trace = trace_user_stack_print, -}; - -static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, - .funcs = &trace_user_stack_funcs, -}; - -/* TRACE_BPRINT */ -static enum print_line_t -trace_bprint_print(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct bprint_entry *field; - - trace_assign_type(field, entry); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_puts(s, ": ")) - goto partial; - - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - - -static enum print_line_t -trace_bprint_raw(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct bprint_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; - - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event_functions trace_bprint_funcs = { - .trace = trace_bprint_print, - .raw = trace_bprint_raw, -}; - -static struct trace_event trace_bprint_event = { - .type = TRACE_BPRINT, - .funcs = &trace_bprint_funcs, -}; - -/* TRACE_PRINT */ -static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct print_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, 
- struct trace_event *event) -{ - struct print_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event_functions trace_print_funcs = { - .trace = trace_print_print, - .raw = trace_print_raw, -}; - -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, - .funcs = &trace_print_funcs, -}; - - -static struct trace_event *events[] __initdata = { - &trace_fn_event, - &trace_ctx_event, - &trace_wake_event, - &trace_stack_event, - &trace_user_stack_event, - &trace_bprint_event, - &trace_print_event, - NULL -}; - -__init static int init_events(void) -{ - struct trace_event *event; - int i, ret; - - for (i = 0; events[i]; i++) { - event = events[i]; - - ret = register_ftrace_event(event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - event->type); - WARN_ON_ONCE(1); - } - } - - return 0; -} -device_initcall(init_events); -/* - * trace binary printk - * - * Copyright (C) 2008 Lai Jiangshan - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -#ifdef CONFIG_MODULES - -/* - * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt - * which are queued on trace_bprintk_fmt_list. 
- */ -static LIST_HEAD(trace_bprintk_fmt_list); - -/* serialize accesses to trace_bprintk_fmt_list */ -static DEFINE_MUTEX(btrace_mutex); - -struct trace_bprintk_fmt { - struct list_head list; - const char *fmt; -}; - -static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) -{ - struct trace_bprintk_fmt *pos; - list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { - if (!strcmp(pos->fmt, fmt)) - return pos; - } - return NULL; -} - -static -void hold_module_trace_bprintk_format(const char **start, const char **end) -{ - const char **iter; - char *fmt; - - mutex_lock(&btrace_mutex); - for (iter = start; iter < end; iter++) { - struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); - if (tb_fmt) { - *iter = tb_fmt->fmt; - continue; - } - - fmt = NULL; - tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); - if (tb_fmt) { - fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); - if (fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(fmt, *iter); - tb_fmt->fmt = fmt; - } else - kfree(tb_fmt); - } - *iter = fmt; - - } - mutex_unlock(&btrace_mutex); -} - -static int module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - if (mod->num_trace_bprintk_fmt) { - const char **start = mod->trace_bprintk_fmt_start; - const char **end = start + mod->num_trace_bprintk_fmt; - - if (val == MODULE_STATE_COMING) - hold_module_trace_bprintk_format(start, end); - } - return 0; -} - -/* - * The debugfs/tracing/printk_formats file maps the addresses with - * the ASCII formats that are used in the bprintk events in the - * buffer. For userspace tools to be able to decode the events from - * the buffer, they need to be able to map the address with the format. - * - * The addresses of the bprintk formats are in their own section - * __trace_printk_fmt. But for modules we copy them into a link list. 
- * The code to print the formats and their addresses passes around the - * address of the fmt string. If the fmt address passed into the seq - * functions is within the kernel core __trace_printk_fmt section, then - * it simply uses the next pointer in the list. - * - * When the fmt pointer is outside the kernel core __trace_printk_fmt - * section, then we need to read the link list pointers. The trick is - * we pass the address of the string to the seq function just like - * we do for the kernel core formats. To get back the structure that - * holds the format, we simply use containerof() and then go to the - * next format in the list. - */ -static const char ** -find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) -{ - struct trace_bprintk_fmt *mod_fmt; - - if (list_empty(&trace_bprintk_fmt_list)) - return NULL; - - /* - * v will point to the address of the fmt record from t_next - * v will be NULL from t_start. - * If this is the first pointer or called from start - * then we need to walk the list. - */ - if (!v || start_index == *pos) { - struct trace_bprintk_fmt *p; - - /* search the module list */ - list_for_each_entry(p, &trace_bprintk_fmt_list, list) { - if (start_index == *pos) - return &p->fmt; - start_index++; - } - /* pos > index */ - return NULL; - } - - /* - * v points to the address of the fmt field in the mod list - * structure that holds the module print format. 
- */ - mod_fmt = container_of(v, typeof(*mod_fmt), fmt); - if (mod_fmt->list.next == &trace_bprintk_fmt_list) - return NULL; - - mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); - - return &mod_fmt->fmt; -} - -static void format_mod_start(void) -{ - mutex_lock(&btrace_mutex); -} - -static void format_mod_stop(void) -{ - mutex_unlock(&btrace_mutex); -} - -#else /* !CONFIG_MODULES */ -__init static int -module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -static inline const char ** -find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) -{ - return NULL; -} -static inline void format_mod_start(void) { } -static inline void format_mod_stop(void) { } -#endif /* CONFIG_MODULES */ - - -__initdata_or_module static -struct notifier_block module_trace_bprintk_format_nb = { - .notifier_call = module_trace_bprintk_format_notify, -}; - -int __trace_bprintk(unsigned long ip, const char *fmt, ...) - { - int ret; - va_list ap; - - if (unlikely(!fmt)) - return 0; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vbprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bprintk); - -int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) - { - if (unlikely(!fmt)) - return 0; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vbprintk(ip, fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vbprintk); - -int __trace_printk(unsigned long ip, const char *fmt, ...) 
-{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_printk); - -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) -{ - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vprintk(ip, fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vprintk); - -static const char **find_next(void *v, loff_t *pos) -{ - const char **fmt = v; - int start_index; - - start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; - - if (*pos < start_index) - return __start___trace_bprintk_fmt + *pos; - - return find_next_mod_format(start_index, v, fmt, pos); -} - -static void * -t_start(struct seq_file *m, loff_t *pos) -{ - format_mod_start(); - return find_next(NULL, pos); -} - -static void *t_next(struct seq_file *m, void * v, loff_t *pos) -{ - (*pos)++; - return find_next(v, pos); -} - -static int t_show(struct seq_file *m, void *v) -{ - const char **fmt = v; - const char *str = *fmt; - int i; - - seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); - - /* - * Tabs and new lines need to be converted. 
- */ - for (i = 0; str[i]; i++) { - switch (str[i]) { - case '\n': - seq_puts(m, "\\n"); - break; - case '\t': - seq_puts(m, "\\t"); - break; - case '\\': - seq_puts(m, "\\"); - break; - case '"': - seq_puts(m, "\\\""); - break; - default: - seq_putc(m, str[i]); - } - } - seq_puts(m, "\"\n"); - - return 0; -} - -static void t_stop(struct seq_file *m, void *p) -{ - format_mod_stop(); -} - -static const struct seq_operations show_format_seq_ops = { - .start = t_start, - .next = t_next, - .show = t_show, - .stop = t_stop, -}; - -static int -ftrace_formats_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &show_format_seq_ops); -} - -static const struct file_operations ftrace_formats_fops = { - .open = ftrace_formats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static __init int init_trace_printk_function_export(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return 0; - - trace_create_file("printk_formats", 0444, d_tracer, - NULL, &ftrace_formats_fops); - - return 0; -} - -fs_initcall(init_trace_printk_function_export); - -static __init int init_trace_printk(void) -{ - return register_module_notifier(&module_trace_bprintk_format_nb); -} - -early_initcall(init_trace_printk); -/* - * trace context switch - * - * Copyright (C) 2007 Steven Rostedt - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static struct trace_array *ctx_trace; -static int __read_mostly tracer_enabled; -static int sched_ref; -static DEFINE_MUTEX(sched_register_mutex); -static int sched_stopped; - - -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->buffer; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = 
trace_buffer_lock_reserve(buffer, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); -} - -static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) -{ - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (unlikely(!sched_ref)) - return; - - tracing_record_cmdline(prev); - tracing_record_cmdline(next); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); - - local_irq_restore(flags); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->buffer; - - event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr->buffer, flags, 6, pc); - ftrace_trace_userstack(tr->buffer, flags, pc); -} - 
-static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) -{ - struct trace_array_cpu *data; - unsigned long flags; - int cpu, pc; - - if (unlikely(!sched_ref)) - return; - - tracing_record_cmdline(current); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_wakeup_trace(ctx_trace, wakee, current, - flags, pc); - - local_irq_restore(flags); -} - -static int tracing_sched_register(void) -{ - int ret; - - ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't activate tracepoint" - " probe to kernel_sched_wakeup\n"); - return ret; - } - - ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't activate tracepoint" - " probe to kernel_sched_wakeup_new\n"); - goto fail_deprobe; - } - - ret = register_trace_sched_switch(probe_sched_switch, NULL); - if (ret) { - pr_info("sched trace: Couldn't activate tracepoint" - " probe to kernel_sched_switch\n"); - goto fail_deprobe_wake_new; - } - - return ret; -fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); -fail_deprobe: - unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); - return ret; -} - -static void tracing_sched_unregister(void) -{ - unregister_trace_sched_switch(probe_sched_switch, NULL); - unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); - unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); -} - -static void tracing_start_sched_switch(void) -{ - mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) - tracing_sched_register(); - mutex_unlock(&sched_register_mutex); -} - -static void tracing_stop_sched_switch(void) -{ - mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) - tracing_sched_unregister(); - mutex_unlock(&sched_register_mutex); -} - 
-void tracing_start_cmdline_record(void) -{ - tracing_start_sched_switch(); -} - -void tracing_stop_cmdline_record(void) -{ - tracing_stop_sched_switch(); -} - -/** - * tracing_start_sched_switch_record - start tracing context switches - * - * Turns on context switch tracing for a tracer. - */ -void tracing_start_sched_switch_record(void) -{ - if (unlikely(!ctx_trace)) { - WARN_ON(1); - return; - } - - tracing_start_sched_switch(); - - mutex_lock(&sched_register_mutex); - tracer_enabled++; - mutex_unlock(&sched_register_mutex); -} - -/** - * tracing_stop_sched_switch_record - start tracing context switches - * - * Turns off context switch tracing for a tracer. - */ -void tracing_stop_sched_switch_record(void) -{ - mutex_lock(&sched_register_mutex); - tracer_enabled--; - WARN_ON(tracer_enabled < 0); - mutex_unlock(&sched_register_mutex); - - tracing_stop_sched_switch(); -} - -/** - * tracing_sched_switch_assign_trace - assign a trace array for ctx switch - * @tr: trace array pointer to assign - * - * Some tracers might want to record the context switches in their - * trace. This function lets those tracers assign the trace array - * to use. 
- */ -void tracing_sched_switch_assign_trace(struct trace_array *tr) -{ - ctx_trace = tr; -} - -/* - * trace task wakeup timings - * - * Copyright (C) 2007-2008 Steven Rostedt - * Copyright (C) 2008 Ingo Molnar - * - * Based on code from the latency_tracer, that is: - * - * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III - */ -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -static struct trace_array *wakeup_trace; -static int __read_mostly tracer_enabled; - -static struct task_struct *wakeup_task; -static int wakeup_cpu; -static int wakeup_current_cpu; -static unsigned wakeup_prio = -1; -static int wakeup_rt; - -static arch_spinlock_t wakeup_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - -static void wakeup_reset(struct trace_array *tr); -static void __wakeup_reset(struct trace_array *tr); -static int wakeup_graph_entry(struct ftrace_graph_ent *trace); -static void wakeup_graph_return(struct ftrace_graph_ret *trace); - -static int save_lat_flag; - -#define TRACE_DISPLAY_GRAPH 1 - -static struct tracer_opt trace_opts[] = { -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, -#endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; - -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) - -#ifdef CONFIG_FUNCTION_TRACER - -/* - * Prologue for the wakeup function tracers. - * - * Returns 1 if it is OK to continue, and preemption - * is disabled and data->disabled is incremented. - * 0 if the trace is to be ignored, and preemption - * is not disabled and data->disabled is - * kept the same. - * - * Note, this function is also used outside this ifdef but - * inside the #ifdef of the function graph tracer below. - * This is OK, since the function graph tracer is - * dependent on the function tracer. 
- */ -static int -func_prolog_preempt_disable(struct trace_array *tr, - struct trace_array_cpu **data, - int *pc) -{ - long disabled; - int cpu; - - if (likely(!wakeup_task)) - return 0; - - *pc = preempt_count(); - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - if (cpu != wakeup_current_cpu) - goto out_enable; - - *data = tr->data[cpu]; - disabled = atomic_inc_return(&(*data)->disabled); - if (unlikely(disabled != 1)) - goto out; - - return 1; - -out: - atomic_dec(&(*data)->disabled); - -out_enable: - preempt_enable_notrace(); - return 0; -} - -/* - * wakeup uses its own tracer function to keep the overhead down: - */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; - int pc; - - if (!func_prolog_preempt_disable(tr, &data, &pc)) - return; - - local_irq_save(flags); - trace_function(tr, ip, parent_ip, flags, pc); - local_irq_restore(flags); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; -#endif /* CONFIG_FUNCTION_TRACER */ - -static int start_func_tracer(int graph) -{ - int ret; - - if (!graph) - ret = register_ftrace_function(&trace_ops); - else - ret = register_ftrace_graph(&wakeup_graph_return, - &wakeup_graph_entry); - - if (!ret && tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; - - return ret; -} - -static void stop_func_tracer(int graph) -{ - tracer_enabled = 0; - - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) -{ - - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - - if (!(is_graph() ^ set)) - return 0; - - stop_func_tracer(!set); - - wakeup_reset(wakeup_trace); - tracing_max_latency = 0; - - return 
start_func_tracer(set); -} - -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; - int pc, ret = 0; - - if (!func_prolog_preempt_disable(tr, &data, &pc)) - return 0; - - local_save_flags(flags); - ret = __trace_graph_entry(tr, trace, flags, pc); - atomic_dec(&data->disabled); - preempt_enable_notrace(); - - return ret; -} - -static void wakeup_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; - int pc; - - if (!func_prolog_preempt_disable(tr, &data, &pc)) - return; - - local_save_flags(flags); - __trace_graph_return(tr, trace, flags, pc); - atomic_dec(&data->disabled); - - preempt_enable_notrace(); - return; -} - -static void wakeup_trace_open(struct trace_iterator *iter) -{ - if (is_graph()) - graph_trace_open(iter); -} - -static void wakeup_trace_close(struct trace_iterator *iter) -{ - if (iter->private) - graph_trace_close(iter); -} - -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ - TRACE_GRAPH_PRINT_DURATION) - -static enum print_line_t wakeup_print_line(struct trace_iterator *iter) -{ - /* - * In graph mode call the graph tracer output function, - * otherwise go with the TRACE_FN event handler - */ - if (is_graph()) - return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); - - return TRACE_TYPE_UNHANDLED; -} - -static void wakeup_print_header(struct seq_file *s) -{ - if (is_graph()) - print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); - else - trace_default_header(s); -} - -static void -__trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, - unsigned long flags, int pc) -{ - if (is_graph()) - trace_graph_function(tr, ip, parent_ip, flags, pc); - else - trace_function(tr, ip, parent_ip, flags, pc); -} -#else -#define __trace_function trace_function - -static int wakeup_set_flag(u32 
old_flags, u32 bit, int set) -{ - return -EINVAL; -} - -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) -{ - return -1; -} - -static enum print_line_t wakeup_print_line(struct trace_iterator *iter) -{ - return TRACE_TYPE_UNHANDLED; -} - -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_trace_open(struct trace_iterator *iter) { } -static void wakeup_trace_close(struct trace_iterator *iter) { } - -#ifdef CONFIG_FUNCTION_TRACER -static void wakeup_print_header(struct seq_file *s) -{ - trace_default_header(s); -} -#else -static void wakeup_print_header(struct seq_file *s) -{ - trace_latency_header(s); -} -#endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -/* - * Should this new latency be reported/recorded? - */ -static int report_latency(cycle_t delta) -{ - if (tracing_thresh) { - if (delta < tracing_thresh) - return 0; - } else { - if (delta <= tracing_max_latency) - return 0; - } - return 1; -} - -static void -probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) -{ - if (task != wakeup_task) - return; - - wakeup_current_cpu = cpu; -} - -static void notrace -probe_wakeup_sched_switch(void *ignore, - struct task_struct *prev, struct task_struct *next) -{ - struct trace_array_cpu *data; - cycle_t T0, T1, delta; - unsigned long flags; - long disabled; - int cpu; - int pc; - - tracing_record_cmdline(prev); - - if (unlikely(!tracer_enabled)) - return; - - /* - * When we start a new trace, we set wakeup_task to NULL - * and then set tracer_enabled = 1. We want to make sure - * that another CPU does not see the tracer_enabled = 1 - * and the wakeup_task with an older task, that might - * actually be the same as next. 
- */ - smp_rmb(); - - if (next != wakeup_task) - return; - - pc = preempt_count(); - - /* disable local data, not wakeup_cpu data */ - cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); - if (likely(disabled != 1)) - goto out; - - local_irq_save(flags); - arch_spin_lock(&wakeup_lock); - - /* We could race with grabbing wakeup_lock */ - if (unlikely(!tracer_enabled || next != wakeup_task)) - goto out_unlock; - - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - - __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); - tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - - T0 = data->preempt_timestamp; - T1 = ftrace_now(cpu); - delta = T1-T0; - - if (!report_latency(delta)) - goto out_unlock; - - if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); - } - -out_unlock: - __wakeup_reset(wakeup_trace); - arch_spin_unlock(&wakeup_lock); - local_irq_restore(flags); -out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); -} - -static void __wakeup_reset(struct trace_array *tr) -{ - wakeup_cpu = -1; - wakeup_prio = -1; - - if (wakeup_task) - put_task_struct(wakeup_task); - - wakeup_task = NULL; -} - -static void wakeup_reset(struct trace_array *tr) -{ - unsigned long flags; - - tracing_reset_online_cpus(tr); - - local_irq_save(flags); - arch_spin_lock(&wakeup_lock); - __wakeup_reset(tr); - arch_spin_unlock(&wakeup_lock); - local_irq_restore(flags); -} - -static void -probe_wakeup(void *ignore, struct task_struct *p, int success) -{ - struct trace_array_cpu *data; - int cpu = smp_processor_id(); - unsigned long flags; - long disabled; - int pc; - - if (likely(!tracer_enabled)) - return; - - tracing_record_cmdline(p); - tracing_record_cmdline(current); - - if ((wakeup_rt && !rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= current->prio) - return; - - pc = preempt_count(); - disabled 
= atomic_inc_return(&wakeup_trace->data[cpu]->disabled); - if (unlikely(disabled != 1)) - goto out; - - /* interrupts should be off from try_to_wake_up */ - arch_spin_lock(&wakeup_lock); - - /* check for races. */ - if (!tracer_enabled || p->prio >= wakeup_prio) - goto out_locked; - - /* reset the trace */ - __wakeup_reset(wakeup_trace); - - wakeup_cpu = task_cpu(p); - wakeup_current_cpu = wakeup_cpu; - wakeup_prio = p->prio; - - wakeup_task = p; - get_task_struct(wakeup_task); - - local_save_flags(flags); - - data = wakeup_trace->data[wakeup_cpu]; - data->preempt_timestamp = ftrace_now(cpu); - tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); - - /* - * We must be careful in using CALLER_ADDR2. But since wake_up - * is not called by an assembly function (where as schedule is) - * it should be safe to use it here. - */ - __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); - -out_locked: - arch_spin_unlock(&wakeup_lock); -out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); -} - -static void start_wakeup_tracer(struct trace_array *tr) -{ - int ret; - - ret = register_trace_sched_wakeup(probe_wakeup, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't activate tracepoint" - " probe to kernel_sched_wakeup\n"); - return; - } - - ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't activate tracepoint" - " probe to kernel_sched_wakeup_new\n"); - goto fail_deprobe; - } - - ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); - if (ret) { - pr_info("sched trace: Couldn't activate tracepoint" - " probe to kernel_sched_switch\n"); - goto fail_deprobe_wake_new; - } - - ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); - if (ret) { - pr_info("wakeup trace: Couldn't activate tracepoint" - " probe to kernel_sched_migrate_task\n"); - return; - } - - wakeup_reset(tr); - - /* - * Don't let the tracer_enabled = 1 show up before - * the wakeup_task is 
reset. This may be overkill since - * wakeup_reset does a spin_unlock after setting the - * wakeup_task to NULL, but I want to be safe. - * This is a slow path anyway. - */ - smp_wmb(); - - if (start_func_tracer(is_graph())) - printk(KERN_ERR "failed to start wakeup tracer\n"); - - return; -fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_wakeup, NULL); -fail_deprobe: - unregister_trace_sched_wakeup(probe_wakeup, NULL); -} - -static void stop_wakeup_tracer(struct trace_array *tr) -{ - tracer_enabled = 0; - stop_func_tracer(is_graph()); - unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); - unregister_trace_sched_wakeup_new(probe_wakeup, NULL); - unregister_trace_sched_wakeup(probe_wakeup, NULL); - unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); -} - -static int __wakeup_tracer_init(struct trace_array *tr) -{ - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; - - tracing_max_latency = 0; - wakeup_trace = tr; - start_wakeup_tracer(tr); - return 0; -} - -static int wakeup_tracer_init(struct trace_array *tr) -{ - wakeup_rt = 0; - return __wakeup_tracer_init(tr); -} - -static int wakeup_rt_tracer_init(struct trace_array *tr) -{ - wakeup_rt = 1; - return __wakeup_tracer_init(tr); -} - -static void wakeup_tracer_reset(struct trace_array *tr) -{ - stop_wakeup_tracer(tr); - /* make sure we put back any tasks we are tracing */ - wakeup_reset(tr); - - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; -} - -static void wakeup_tracer_start(struct trace_array *tr) -{ - wakeup_reset(tr); - tracer_enabled = 1; -} - -static void wakeup_tracer_stop(struct trace_array *tr) -{ - tracer_enabled = 0; -} - -static struct tracer wakeup_tracer __read_mostly = -{ - .name = "wakeup", - .init = wakeup_tracer_init, - .reset = wakeup_tracer_reset, - .start = wakeup_tracer_start, - .stop = wakeup_tracer_stop, - .print_max = 1, - .print_header = wakeup_print_header, - .print_line = 
wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_wakeup, -#endif - .open = wakeup_trace_open, - .close = wakeup_trace_close, - .use_max_tr = 1, -}; - -static struct tracer wakeup_rt_tracer __read_mostly = -{ - .name = "wakeup_rt", - .init = wakeup_rt_tracer_init, - .reset = wakeup_tracer_reset, - .start = wakeup_tracer_start, - .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, - .print_max = 1, - .print_header = wakeup_print_header, - .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_wakeup, -#endif - .open = wakeup_trace_open, - .close = wakeup_trace_close, - .use_max_tr = 1, -}; - -__init static int init_wakeup_tracer(void) -{ - int ret; - - ret = register_tracer(&wakeup_tracer); - if (ret) - return ret; - - ret = register_tracer(&wakeup_rt_tracer); - if (ret) - return ret; - - return 0; -} -device_initcall(init_wakeup_tracer); -/* Include in trace.c */ - -#include -#include -#include -#include - -static inline int trace_valid_entry(struct trace_entry *entry) -{ - switch (entry->type) { - case TRACE_FN: - case TRACE_CTX: - case TRACE_WAKE: - case TRACE_STACK: - case TRACE_PRINT: - case TRACE_BRANCH: - case TRACE_GRAPH_ENT: - case TRACE_GRAPH_RET: - return 1; - } - return 0; -} - -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) -{ - struct ring_buffer_event *event; - struct trace_entry *entry; - unsigned int loops = 0; - - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { - entry = ring_buffer_event_data(event); - - /* - * The ring buffer is a size of trace_buf_size, if - * we loop more than the size, there's something wrong - * with the ring buffer. - */ - if (loops++ > trace_buf_size) { - printk(KERN_CONT ".. bad ring buffer "); - goto failed; - } - if (!trace_valid_entry(entry)) { - printk(KERN_CONT ".. 
invalid entry %d ", - entry->type); - goto failed; - } - } - return 0; - - failed: - /* disable tracing */ - tracing_disabled = 1; - printk(KERN_CONT ".. corrupted trace buffer .. "); - return -1; -} - -/* - * Test the trace buffer to see if all the elements - * are still sane. - */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) -{ - unsigned long flags, cnt = 0; - int cpu, ret = 0; - - /* Don't allow flipping of max traces now */ - local_irq_save(flags); - arch_spin_lock(&ftrace_max_lock); - - cnt = ring_buffer_entries(tr->buffer); - - /* - * The trace_test_buffer_cpu runs a while loop to consume all data. - * If the calling tracer is broken, and is constantly filling - * the buffer, this will run forever, and hard lock the box. - * We disable the ring buffer while we do this test to prevent - * a hard lock up. - */ - tracing_off(); - for_each_possible_cpu(cpu) { - ret = trace_test_buffer_cpu(tr, cpu); - if (ret) - break; - } - tracing_on(); - arch_spin_unlock(&ftrace_max_lock); - local_irq_restore(flags); - - if (count) - *count = cnt; - - return ret; -} - -static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) -{ - printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", - trace->name, init_ret); -} -#ifdef CONFIG_FUNCTION_TRACER - -#ifdef CONFIG_DYNAMIC_FTRACE - -static int trace_selftest_test_probe1_cnt; -static void trace_selftest_test_probe1_func(unsigned long ip, - unsigned long pip) -{ - trace_selftest_test_probe1_cnt++; -} - -static int trace_selftest_test_probe2_cnt; -static void trace_selftest_test_probe2_func(unsigned long ip, - unsigned long pip) -{ - trace_selftest_test_probe2_cnt++; -} - -static int trace_selftest_test_probe3_cnt; -static void trace_selftest_test_probe3_func(unsigned long ip, - unsigned long pip) -{ - trace_selftest_test_probe3_cnt++; -} - -static int trace_selftest_test_global_cnt; -static void trace_selftest_test_global_func(unsigned long ip, - unsigned long pip) -{ - 
trace_selftest_test_global_cnt++; -} - -static int trace_selftest_test_dyn_cnt; -static void trace_selftest_test_dyn_func(unsigned long ip, - unsigned long pip) -{ - trace_selftest_test_dyn_cnt++; -} - -static struct ftrace_ops test_probe1 = { - .func = trace_selftest_test_probe1_func, -}; - -static struct ftrace_ops test_probe2 = { - .func = trace_selftest_test_probe2_func, -}; - -static struct ftrace_ops test_probe3 = { - .func = trace_selftest_test_probe3_func, -}; - -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL, -}; - -static void print_counts(void) -{ - printk("(%d %d %d %d %d) ", - trace_selftest_test_probe1_cnt, - trace_selftest_test_probe2_cnt, - trace_selftest_test_probe3_cnt, - trace_selftest_test_global_cnt, - trace_selftest_test_dyn_cnt); -} - -static void reset_counts(void) -{ - trace_selftest_test_probe1_cnt = 0; - trace_selftest_test_probe2_cnt = 0; - trace_selftest_test_probe3_cnt = 0; - trace_selftest_test_global_cnt = 0; - trace_selftest_test_dyn_cnt = 0; -} - -static int trace_selftest_ops(int cnt) -{ - int save_ftrace_enabled = ftrace_enabled; - struct ftrace_ops *dyn_ops; - char *func1_name; - char *func2_name; - int len1; - int len2; - int ret = -1; - - printk(KERN_CONT "PASSED\n"); - pr_info("Testing dynamic ftrace ops #%d: ", cnt); - - ftrace_enabled = 1; - reset_counts(); - - /* Handle PPC64 '.' name */ - func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); - func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); - len1 = strlen(func1_name); - len2 = strlen(func2_name); - - /* - * Probe 1 will trace function 1. - * Probe 2 will trace function 2. - * Probe 3 will trace functions 1 and 2. 
- */ - ftrace_set_filter(&test_probe1, func1_name, len1, 1); - ftrace_set_filter(&test_probe2, func2_name, len2, 1); - ftrace_set_filter(&test_probe3, func1_name, len1, 1); - ftrace_set_filter(&test_probe3, func2_name, len2, 0); - - register_ftrace_function(&test_probe1); - register_ftrace_function(&test_probe2); - register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); - - DYN_FTRACE_TEST_NAME(); - - print_counts(); - - if (trace_selftest_test_probe1_cnt != 1) - goto out; - if (trace_selftest_test_probe2_cnt != 0) - goto out; - if (trace_selftest_test_probe3_cnt != 1) - goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; - - DYN_FTRACE_TEST_NAME2(); - - print_counts(); - - if (trace_selftest_test_probe1_cnt != 1) - goto out; - if (trace_selftest_test_probe2_cnt != 1) - goto out; - if (trace_selftest_test_probe3_cnt != 2) - goto out; - - /* Add a dynamic probe */ - dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); - if (!dyn_ops) { - printk("MEMORY ERROR "); - goto out; - } - - dyn_ops->func = trace_selftest_test_dyn_func; - - register_ftrace_function(dyn_ops); - - trace_selftest_test_global_cnt = 0; - - DYN_FTRACE_TEST_NAME(); - - print_counts(); - - if (trace_selftest_test_probe1_cnt != 2) - goto out_free; - if (trace_selftest_test_probe2_cnt != 1) - goto out_free; - if (trace_selftest_test_probe3_cnt != 3) - goto out_free; - if (trace_selftest_test_global_cnt == 0) - goto out; - if (trace_selftest_test_dyn_cnt == 0) - goto out_free; - - DYN_FTRACE_TEST_NAME2(); - - print_counts(); - - if (trace_selftest_test_probe1_cnt != 2) - goto out_free; - if (trace_selftest_test_probe2_cnt != 2) - goto out_free; - if (trace_selftest_test_probe3_cnt != 4) - goto out_free; - - ret = 0; - out_free: - unregister_ftrace_function(dyn_ops); - kfree(dyn_ops); - - out: - /* Purposely unregister in the same order */ - unregister_ftrace_function(&test_probe1); - unregister_ftrace_function(&test_probe2); - unregister_ftrace_function(&test_probe3); 
- unregister_ftrace_function(&test_global); - - /* Make sure everything is off */ - reset_counts(); - DYN_FTRACE_TEST_NAME(); - DYN_FTRACE_TEST_NAME(); - - if (trace_selftest_test_probe1_cnt || - trace_selftest_test_probe2_cnt || - trace_selftest_test_probe3_cnt || - trace_selftest_test_global_cnt || - trace_selftest_test_dyn_cnt) - ret = -1; - - ftrace_enabled = save_ftrace_enabled; - - return ret; -} - -/* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) -{ - int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; - unsigned long count; - char *func_name; - int ret; - - /* The ftrace test PASSED */ - printk(KERN_CONT "PASSED\n"); - pr_info("Testing dynamic ftrace: "); - - /* enable tracing, and record the filter function */ - ftrace_enabled = 1; - tracer_enabled = 1; - - /* passed in by parameter to fool gcc from optimizing */ - func(); - - /* - * Some archs *cough*PowerPC*cough* add characters to the - * start of the function names. We simply put a '*' to - * accommodate them. - */ - func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); - - /* filter only on our function */ - ftrace_set_global_filter(func_name, strlen(func_name), 1); - - /* enable tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - goto out; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - - /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); - if (ret) - goto out; - - if (count) { - ret = -1; - printk(KERN_CONT ".. filter did not filter .. "); - goto out; - } - - /* call our function again */ - func(); - - /* sleep again */ - msleep(100); - - /* stop the tracing. 
*/ - tracing_stop(); - ftrace_enabled = 0; - - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - tracing_start(); - - /* we should only have one item */ - if (!ret && count != 1) { - trace->reset(tr); - printk(KERN_CONT ".. filter failed count=%ld ..", count); - ret = -1; - goto out; - } - - /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); - trace->reset(tr); - - out: - ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; - - /* Enable tracing on all functions again */ - ftrace_set_global_filter(NULL, 0, 1); - - /* Test the ops with global tracing off */ - if (!ret) - ret = trace_selftest_ops(2); - - return ret; -} -#else -# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) -#endif /* CONFIG_DYNAMIC_FTRACE */ - -/* - * Simple verification test of ftrace function tracer. - * Enable ftrace, sleep 1/10 second, and then read the trace - * buffer to see if all is in order. - */ -int -trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) -{ - int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; - unsigned long count; - int ret; - - /* make sure msleep has been recorded */ - msleep(1); - - /* start the tracing */ - ftrace_enabled = 1; - tracer_enabled = 1; - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - goto out; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - ftrace_enabled = 0; - - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. 
no entries found .."); - ret = -1; - goto out; - } - - ret = trace_selftest_startup_dynamic_tracing(trace, tr, - DYN_FTRACE_TEST_NAME); - - out: - ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; - - /* kill ftrace totally if we failed */ - if (ret) - ftrace_kill(); - - return ret; -} -#endif /* CONFIG_FUNCTION_TRACER */ - - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -/* Maximum number of functions to trace before diagnosing a hang */ -#define GRAPH_MAX_FUNC_TEST 100000000 - -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); -static unsigned int graph_hang_thresh; - -/* Wrap the real function entry probe to avoid possible hanging */ -static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) -{ - /* This is harmlessly racy, we want to approximately detect a hang */ - if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { - ftrace_graph_stop(); - printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) - __ftrace_dump(false, DUMP_ALL); - return 0; - } - - return trace_graph_entry(trace); -} - -/* - * Pretty much the same than for the function tracer from which the selftest - * has been borrowed. - */ -int -trace_selftest_startup_function_graph(struct tracer *trace, - struct trace_array *tr) -{ - int ret; - unsigned long count; - - /* - * Simulate the init() callback but we attach a watchdog callback - * to detect and recover from possible hangs - */ - tracing_reset_online_cpus(tr); - set_graph_array(tr); - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry_watchdog); - if (ret) { - warn_failed_init_tracer(trace, ret); - goto out; - } - tracing_start_cmdline_record(); - - /* Sleep for a 1/10 of a second */ - msleep(100); - - /* Have we just recovered from a hang? 
*/ - if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { - tracing_selftest_disabled = true; - ret = -1; - goto out; - } - - tracing_stop(); - - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - goto out; - } - - /* Don't test dynamic tracing, the function tracer already did */ - -out: - /* Stop it if we failed */ - if (ret) - ftrace_graph_stop(); - - return ret; -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - -#ifdef CONFIG_IRQSOFF_TRACER -int -trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* reset the max latency */ - tracing_max_latency = 0; - /* disable interrupts for a bit */ - local_irq_disable(); - udelay(100); - local_irq_enable(); - - /* - * Stop the tracer to avoid a warning subsequent - * to buffer flipping failure because tracing_stop() - * disables the tr and max buffers, making flipping impossible - * in case of parallels max irqs off latencies. - */ - trace->stop(tr); - /* stop the tracing. */ - tracing_stop(); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (!ret) - ret = trace_test_buffer(&max_tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. 
no entries found .."); - ret = -1; - } - - tracing_max_latency = save_max; - - return ret; -} -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -int -trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - unsigned long count; - int ret; - - /* - * Now that the big kernel lock is no longer preemptable, - * and this is called with the BKL held, it will always - * fail. If preemption is already disabled, simply - * pass the test. When the BKL is removed, or becomes - * preemptible again, we will once again test this, - * so keep it in. - */ - if (preempt_count()) { - printk(KERN_CONT "can not test ... force "); - return 0; - } - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* reset the max latency */ - tracing_max_latency = 0; - /* disable preemption for a bit */ - preempt_disable(); - udelay(100); - preempt_enable(); - - /* - * Stop the tracer to avoid a warning subsequent - * to buffer flipping failure because tracing_stop() - * disables the tr and max buffers, making flipping impossible - * in case of parallels max preempt off latencies. - */ - trace->stop(tr); - /* stop the tracing. */ - tracing_stop(); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (!ret) - ret = trace_test_buffer(&max_tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. 
no entries found .."); - ret = -1; - } - - tracing_max_latency = save_max; - - return ret; -} -#endif /* CONFIG_PREEMPT_TRACER */ - -#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) -int -trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - unsigned long count; - int ret; - - /* - * Now that the big kernel lock is no longer preemptable, - * and this is called with the BKL held, it will always - * fail. If preemption is already disabled, simply - * pass the test. When the BKL is removed, or becomes - * preemptible again, we will once again test this, - * so keep it in. - */ - if (preempt_count()) { - printk(KERN_CONT "can not test ... force "); - return 0; - } - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - goto out_no_start; - } - - /* reset the max latency */ - tracing_max_latency = 0; - - /* disable preemption and interrupts for a bit */ - preempt_disable(); - local_irq_disable(); - udelay(100); - preempt_enable(); - /* reverse the order of preempt vs irqs */ - local_irq_enable(); - - /* - * Stop the tracer to avoid a warning subsequent - * to buffer flipping failure because tracing_stop() - * disables the tr and max buffers, making flipping impossible - * in case of parallels max irqs/preempt off latencies. - */ - trace->stop(tr); - /* stop the tracing. */ - tracing_stop(); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (ret) - goto out; - - ret = trace_test_buffer(&max_tr, &count); - if (ret) - goto out; - - if (!ret && !count) { - printk(KERN_CONT ".. 
no entries found .."); - ret = -1; - goto out; - } - - /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; - tracing_start(); - trace->start(tr); - - preempt_disable(); - local_irq_disable(); - udelay(100); - preempt_enable(); - /* reverse the order of preempt vs irqs */ - local_irq_enable(); - - trace->stop(tr); - /* stop the tracing. */ - tracing_stop(); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (ret) - goto out; - - ret = trace_test_buffer(&max_tr, &count); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - goto out; - } - -out: - tracing_start(); -out_no_start: - trace->reset(tr); - tracing_max_latency = save_max; - - return ret; -} -#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ - -#ifdef CONFIG_NOP_TRACER -int -trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) -{ - /* What could possibly go wrong? */ - return 0; -} -#endif - -#ifdef CONFIG_SCHED_TRACER -static int trace_wakeup_test_thread(void *data) -{ - /* Make this a RT thread, doesn't need to be too high */ - static const struct sched_param param = { .sched_priority = 5 }; - struct completion *x = data; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* Make it know we have a new prio */ - complete(x); - - /* now go to sleep and let the test wake us up */ - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - /* we are awake, now wait to disappear */ - while (!kthread_should_stop()) { - /* - * This is an RT task, do short sleeps to let - * others run. 
- */ - msleep(100); - } - - return 0; -} - -int -trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) -{ - unsigned long save_max = tracing_max_latency; - struct task_struct *p; - struct completion isrt; - unsigned long count; - int ret; - - init_completion(&isrt); - - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); - if (IS_ERR(p)) { - printk(KERN_CONT "Failed to create ftrace wakeup test thread "); - return -1; - } - - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* reset the max latency */ - tracing_max_latency = 0; - - /* sleep to let the RT thread sleep too */ - msleep(100); - - /* - * Yes this is slightly racy. It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. - */ - - wake_up_process(p); - - /* give a little time to let the thread wake up */ - msleep(100); - - /* stop the tracing. */ - tracing_stop(); - /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); - if (!ret) - ret = trace_test_buffer(&max_tr, &count); - - - trace->reset(tr); - tracing_start(); - - tracing_max_latency = save_max; - - /* kill the thread */ - kthread_stop(p); - - if (!ret && !count) { - printk(KERN_CONT ".. 
no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SCHED_TRACER */ - -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -int -trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - -#ifdef CONFIG_BRANCH_TRACER -int -trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_BRANCH_TRACER */ - -#include "trace.h" - -int DYN_FTRACE_TEST_NAME(void) -{ - /* used to call mcount */ - return 0; -} - -int DYN_FTRACE_TEST_NAME2(void) -{ - /* used to call mcount */ - return 0; -} -/* - * Copyright (C) 2008 Steven Rostedt - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - -#define STACK_TRACE_ENTRIES 500 - -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = - { [0 ... 
(STACK_TRACE_ENTRIES)] = ULONG_MAX }; -static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; - -static struct stack_trace max_stack_trace = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = stack_dump_trace, -}; - -static unsigned long max_stack_size; -static arch_spinlock_t max_stack_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - -static int stack_trace_disabled __read_mostly; -static DEFINE_PER_CPU(int, trace_active); -static DEFINE_MUTEX(stack_sysctl_mutex); - -int stack_tracer_enabled; -static int last_stack_tracer_enabled; - -static inline void check_stack(void) -{ - unsigned long this_size, flags; - unsigned long *p, *top, *start; - int i; - - this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); - this_size = THREAD_SIZE - this_size; - - if (this_size <= max_stack_size) - return; - - /* we do not handle interrupt stacks yet */ - if (!object_is_on_stack(&this_size)) - return; - - local_irq_save(flags); - arch_spin_lock(&max_stack_lock); - - /* a race could have already updated it */ - if (this_size <= max_stack_size) - goto out; - - max_stack_size = this_size; - - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; - - save_stack_trace(&max_stack_trace); - - /* - * Now find where in the stack these are. - */ - i = 0; - start = &this_size; - top = (unsigned long *) - (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); - - /* - * Loop through all the entries. One of the entries may - * for some reason be missed on the stack, so we may - * have to account for them. If they are all there, this - * loop will only happen once. This code only takes place - * on a new max, so it is far from a fast path. 
- */ - while (i < max_stack_trace.nr_entries) { - int found = 0; - - stack_dump_index[i] = this_size; - p = start; - - for (; p < top && i < max_stack_trace.nr_entries; p++) { - if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = - (top - p) * sizeof(unsigned long); - found = 1; - /* Start the search from here */ - start = p + 1; - } - } - - if (!found) - i++; - } - - out: - arch_spin_unlock(&max_stack_lock); - local_irq_restore(flags); -} - -static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) -{ - int cpu; - - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - /* no atomic needed, we only modify this variable by this cpu */ - if (per_cpu(trace_active, cpu)++ != 0) - goto out; - - check_stack(); - - out: - per_cpu(trace_active, cpu)--; - /* prevent recursion in schedule */ - preempt_enable_notrace(); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = stack_trace_call, -}; - -static ssize_t -stack_max_size_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - unsigned long *ptr = filp->private_data; - char buf[64]; - int r; - - r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); - if (r > sizeof(buf)) - r = sizeof(buf); - return simple_read_from_buffer(ubuf, count, ppos, buf, r); -} - -static ssize_t -stack_max_size_write(struct file *filp, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - long *ptr = filp->private_data; - unsigned long val, flags; - int ret; - int cpu; - - ret = kstrtoul_from_user(ubuf, count, 10, &val); - if (ret) - return ret; - - local_irq_save(flags); - - /* - * In case we trace inside arch_spin_lock() or after (NMI), - * we will cause circular lock, so we also need to increase - * the percpu trace_active here. 
- */ - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; - - arch_spin_lock(&max_stack_lock); - *ptr = val; - arch_spin_unlock(&max_stack_lock); - - per_cpu(trace_active, cpu)--; - local_irq_restore(flags); - - return count; -} - -static const struct file_operations stack_max_size_fops = { - .open = tracing_open_generic, - .read = stack_max_size_read, - .write = stack_max_size_write, - .llseek = default_llseek, -}; - -static void * -__next(struct seq_file *m, loff_t *pos) -{ - long n = *pos - 1; - - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) - return NULL; - - m->private = (void *)n; - return &m->private; -} - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return __next(m, pos); -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - int cpu; - - local_irq_disable(); - - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; - - arch_spin_lock(&max_stack_lock); - - if (*pos == 0) - return SEQ_START_TOKEN; - - return __next(m, pos); -} - -static void t_stop(struct seq_file *m, void *p) -{ - int cpu; - - arch_spin_unlock(&max_stack_lock); - - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)--; - - local_irq_enable(); -} - -static int trace_lookup_stack(struct seq_file *m, long i) -{ - unsigned long addr = stack_dump_trace[i]; - - return seq_printf(m, "%pS\n", (void *)addr); -} - -static void print_disabled(struct seq_file *m) -{ - seq_puts(m, "#\n" - "# Stack tracer disabled\n" - "#\n" - "# To enable the stack tracer, either add 'stacktrace' to the\n" - "# kernel command line\n" - "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" - "#\n"); -} - -static int t_show(struct seq_file *m, void *v) -{ - long i; - int size; - - if (v == SEQ_START_TOKEN) { - seq_printf(m, " Depth Size Location" - " (%d entries)\n" - " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); - - if (!stack_tracer_enabled && !max_stack_size) - print_disabled(m); - - return 0; - } - - i = *(long 
*)v; - - if (i >= max_stack_trace.nr_entries || - stack_dump_trace[i] == ULONG_MAX) - return 0; - - if (i+1 == max_stack_trace.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; - else - size = stack_dump_index[i] - stack_dump_index[i+1]; - - seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); - - trace_lookup_stack(m, i); - - return 0; -} - -static const struct seq_operations stack_trace_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int stack_trace_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &stack_trace_seq_ops); -} - -static const struct file_operations stack_trace_fops = { - .open = stack_trace_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int -stack_trace_filter_open(struct inode *inode, struct file *file) -{ - return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, - inode, file); -} - -static const struct file_operations stack_trace_filter_fops = { - .open = stack_trace_filter_open, - .read = seq_read, - .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, - .release = ftrace_regex_release, -}; - -int -stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - - mutex_lock(&stack_sysctl_mutex); - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (ret || !write || - (last_stack_tracer_enabled == !!stack_tracer_enabled)) - goto out; - - last_stack_tracer_enabled = !!stack_tracer_enabled; - - if (stack_tracer_enabled) - register_ftrace_function(&trace_ops); - else - unregister_ftrace_function(&trace_ops); - - out: - mutex_unlock(&stack_sysctl_mutex); - return ret; -} - -static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; - -static __init int enable_stacktrace(char *str) -{ - if (strncmp(str, "_filter=", 8) == 0) - strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); - - 
stack_tracer_enabled = 1; - last_stack_tracer_enabled = 1; - return 1; -} -__setup("stacktrace", enable_stacktrace); - -static __init int stack_trace_init(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - - trace_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - - trace_create_file("stack_trace_filter", 0444, d_tracer, - NULL, &stack_trace_filter_fops); - - if (stack_trace_filter_buf[0]) - ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); - - if (stack_tracer_enabled) - register_ftrace_function(&trace_ops); - - return 0; -} - -device_initcall(stack_trace_init); -/* - * Infrastructure for statistic tracing (histogram output). - * - * Copyright (C) 2008-2009 Frederic Weisbecker - * - * Based on the code from trace_branch.c which is - * Copyright (C) 2008 Steven Rostedt - * - */ - - -#include -#include -#include -#include -#include "trace_stat.h" -#include "trace.h" - - -/* - * List of stat red-black nodes from a tracer - * We use a such tree to sort quickly the stat - * entries from the tracer. - */ -struct stat_node { - struct rb_node node; - void *stat; -}; - -/* A stat session is the stats output in one file */ -struct stat_session { - struct list_head session_list; - struct tracer_stat *ts; - struct rb_root stat_root; - struct mutex stat_mutex; - struct dentry *file; -}; - -/* All of the sessions currently in use. Each stat file embed one session */ -static LIST_HEAD(all_stat_sessions); -static DEFINE_MUTEX(all_stat_sessions_mutex); - -/* The root directory for all stat files */ -static struct dentry *stat_dir; - -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. 
- */ -static struct rb_node *release_next(struct tracer_stat *ts, - struct rb_node *node) -{ - struct stat_node *snode; - struct rb_node *parent = rb_parent(node); - - if (node->rb_left) - return node->rb_left; - else if (node->rb_right) - return node->rb_right; - else { - if (!parent) - ; - else if (parent->rb_left == node) - parent->rb_left = NULL; - else - parent->rb_right = NULL; - - snode = container_of(node, struct stat_node, node); - if (ts->stat_release) - ts->stat_release(snode->stat); - kfree(snode); - - return parent; - } -} - -static void __reset_stat_session(struct stat_session *session) -{ - struct rb_node *node = session->stat_root.rb_node; - - while (node) - node = release_next(session->ts, node); - - session->stat_root = RB_ROOT; -} - -static void reset_stat_session(struct stat_session *session) -{ - mutex_lock(&session->stat_mutex); - __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); -} - -static void destroy_session(struct stat_session *session) -{ - debugfs_remove(session->file); - __reset_stat_session(session); - mutex_destroy(&session->stat_mutex); - kfree(session); -} - -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct stat_node *data; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->stat = stat; - - /* - * Figure out where to put new node - * This is a descendent sorting - */ - while (*new) { - struct stat_node *this; - int result; - - this = container_of(*new, struct stat_node, node); - result = cmp(data->stat, this->stat); - - parent = *new; - if (result >= 0) - new = &((*new)->rb_left); - else - new = &((*new)->rb_right); - } - - rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); - return 0; -} - -/* - * For tracers that don't provide a stat_cmp callback. 
- * This one will force an insertion as right-most node - * in the rbtree. - */ -static int dummy_cmp(void *p1, void *p2) -{ - return -1; -} - -/* - * Initialize the stat rbtree at each trace_stat file opening. - * All of these copies and sorting are required on all opening - * since the stats could have changed between two file sessions. - */ -static int stat_seq_init(struct stat_session *session) -{ - struct tracer_stat *ts = session->ts; - struct rb_root *root = &session->stat_root; - void *stat; - int ret = 0; - int i; - - mutex_lock(&session->stat_mutex); - __reset_stat_session(session); - - if (!ts->stat_cmp) - ts->stat_cmp = dummy_cmp; - - stat = ts->stat_start(ts); - if (!stat) - goto exit; - - ret = insert_stat(root, stat, ts->stat_cmp); - if (ret) - goto exit; - - /* - * Iterate over the tracer stat entries and store them in an rbtree. - */ - for (i = 1; ; i++) { - stat = ts->stat_next(stat, i); - - /* End of insertion */ - if (!stat) - break; - - ret = insert_stat(root, stat, ts->stat_cmp); - if (ret) - goto exit_free_rbtree; - } - -exit: - mutex_unlock(&session->stat_mutex); - return ret; - -exit_free_rbtree: - __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); - return ret; -} - - -static void *stat_seq_start(struct seq_file *s, loff_t *pos) -{ - struct stat_session *session = s->private; - struct rb_node *node; - int n = *pos; - int i; - - /* Prevent from tracer switch or rbtree modification */ - mutex_lock(&session->stat_mutex); - - /* If we are in the beginning of the file, print the headers */ - if (session->ts->stat_headers) { - if (n == 0) - return SEQ_START_TOKEN; - n--; - } - - node = rb_first(&session->stat_root); - for (i = 0; node && i < n; i++) - node = rb_next(node); - - return node; -} - -static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) -{ - struct stat_session *session = s->private; - struct rb_node *node = p; - - (*pos)++; - - if (p == SEQ_START_TOKEN) - return rb_first(&session->stat_root); - - 
return rb_next(node); -} - -static void stat_seq_stop(struct seq_file *s, void *p) -{ - struct stat_session *session = s->private; - mutex_unlock(&session->stat_mutex); -} - -static int stat_seq_show(struct seq_file *s, void *v) -{ - struct stat_session *session = s->private; - struct stat_node *l = container_of(v, struct stat_node, node); - - if (v == SEQ_START_TOKEN) - return session->ts->stat_headers(s); - - return session->ts->stat_show(s, l->stat); -} - -static const struct seq_operations trace_stat_seq_ops = { - .start = stat_seq_start, - .next = stat_seq_next, - .stop = stat_seq_stop, - .show = stat_seq_show -}; - -/* The session stat is refilled and resorted at each stat file opening */ -static int tracing_stat_open(struct inode *inode, struct file *file) -{ - int ret; - struct seq_file *m; - struct stat_session *session = inode->i_private; - - ret = stat_seq_init(session); - if (ret) - return ret; - - ret = seq_open(file, &trace_stat_seq_ops); - if (ret) { - reset_stat_session(session); - return ret; - } - - m = file->private_data; - m->private = session; - return ret; -} - -/* - * Avoid consuming memory with our now useless rbtree. 
- */ -static int tracing_stat_release(struct inode *i, struct file *f) -{ - struct stat_session *session = i->i_private; - - reset_stat_session(session); - - return seq_release(i, f); -} - -static const struct file_operations tracing_stat_fops = { - .open = tracing_stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_stat_release -}; - -static int tracing_stat_init(void) -{ - struct dentry *d_tracing; - - d_tracing = tracing_init_dentry(); - - stat_dir = debugfs_create_dir("trace_stat", d_tracing); - if (!stat_dir) - pr_warning("Could not create debugfs " - "'trace_stat' entry\n"); - return 0; -} - -static int init_stat_file(struct stat_session *session) -{ - if (!stat_dir && tracing_stat_init()) - return -ENODEV; - - session->file = debugfs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); - if (!session->file) - return -ENOMEM; - return 0; -} - -int register_stat_tracer(struct tracer_stat *trace) -{ - struct stat_session *session, *node; - int ret; - - if (!trace) - return -EINVAL; - - if (!trace->stat_start || !trace->stat_next || !trace->stat_show) - return -EINVAL; - - /* Already registered? 
*/ - mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry(node, &all_stat_sessions, session_list) { - if (node->ts == trace) { - mutex_unlock(&all_stat_sessions_mutex); - return -EINVAL; - } - } - mutex_unlock(&all_stat_sessions_mutex); - - /* Init the session */ - session = kzalloc(sizeof(*session), GFP_KERNEL); - if (!session) - return -ENOMEM; - - session->ts = trace; - INIT_LIST_HEAD(&session->session_list); - mutex_init(&session->stat_mutex); - - ret = init_stat_file(session); - if (ret) { - destroy_session(session); - return ret; - } - - /* Register */ - mutex_lock(&all_stat_sessions_mutex); - list_add_tail(&session->session_list, &all_stat_sessions); - mutex_unlock(&all_stat_sessions_mutex); - - return 0; -} - -void unregister_stat_tracer(struct tracer_stat *trace) -{ - struct stat_session *node, *tmp; - - mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) { - list_del(&node->session_list); - destroy_session(node); - break; - } - } - mutex_unlock(&all_stat_sessions_mutex); -} -#include -#include -#include -#include -#include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ -#include -#include -#include - -#include "trace_output.h" -#include "trace.h" - -static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); - -static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); -static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); - -static int syscall_enter_define_fields(struct ftrace_event_call *call); -static int syscall_exit_define_fields(struct ftrace_event_call *call); - -static struct list_head * -syscall_get_enter_fields(struct ftrace_event_call *call) -{ - struct syscall_metadata *entry = call->data; - - return &entry->enter_fields; -} 
- -struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, -}; - -struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, -}; - -struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, -}; - -struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), - .raw_init = init_syscall_trace, -}; - -extern struct syscall_metadata *__start_syscalls_metadata[]; -extern struct syscall_metadata *__stop_syscalls_metadata[]; - -static struct syscall_metadata **syscalls_metadata; - -#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME -static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) -{ - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. 
- */ - return !strcmp(sym + 3, name + 3); -} -#endif - -static __init struct syscall_metadata * -find_syscall_meta(unsigned long syscall) -{ - struct syscall_metadata **start; - struct syscall_metadata **stop; - char str[KSYM_SYMBOL_LEN]; - - - start = __start_syscalls_metadata; - stop = __stop_syscalls_metadata; - kallsyms_lookup(syscall, NULL, NULL, NULL, str); - - if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) - return NULL; - - for ( ; start < stop; start++) { - if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) - return *start; - } - return NULL; -} - -static struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -enum print_line_t -print_syscall_enter(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *ent = iter->ent; - struct syscall_trace_enter *trace; - struct syscall_metadata *entry; - int i, ret, syscall; - - trace = (typeof(trace))ent; - syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); - - if (!entry) - goto end; - - if (entry->enter_event->event.type != ent->type) { - WARN_ON_ONCE(1); - goto end; - } - - ret = trace_seq_printf(s, "%s(", entry->name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - for (i = 0; i < entry->nb_args; i++) { - /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) { - ret = trace_seq_printf(s, "%s ", entry->types[i]); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? 
"" : ", "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = trace_seq_putc(s, ')'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - -end: - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -enum print_line_t -print_syscall_exit(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *ent = iter->ent; - struct syscall_trace_exit *trace; - int syscall; - struct syscall_metadata *entry; - int ret; - - trace = (typeof(trace))ent; - syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); - - if (!entry) { - trace_seq_printf(s, "\n"); - return TRACE_TYPE_HANDLED; - } - - if (entry->exit_event->event.type != ent->type) { - WARN_ON_ONCE(1); - return TRACE_TYPE_UNHANDLED; - } - - ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, - trace->ret); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -extern char *__bad_type_size(void); - -#define SYSCALL_FIELD(type, name) \ - sizeof(type) != sizeof(trace.name) ? \ - __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), \ - sizeof(trace.name), is_signed_type(type) - -static -int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) -{ - int i; - int pos = 0; - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", - entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? 
"" : ", "); - } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - - for (i = 0; i < entry->nb_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", ((unsigned long)(REC->%s))", entry->args[i]); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_syscall_print_fmt(struct ftrace_event_call *call) -{ - char *print_fmt; - int len; - struct syscall_metadata *entry = call->data; - - if (entry->enter_event != call) { - call->print_fmt = "\"0x%lx\", REC->ret"; - return 0; - } - - /* First: called with 0 length to calculate the needed length */ - len = __set_enter_print_fmt(entry, NULL, 0); - - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_enter_print_fmt(entry, print_fmt, len + 1); - call->print_fmt = print_fmt; - - return 0; -} - -static void free_syscall_print_fmt(struct ftrace_event_call *call) -{ - struct syscall_metadata *entry = call->data; - - if (entry->enter_event == call) - kfree(call->print_fmt); -} - -static int syscall_enter_define_fields(struct ftrace_event_call *call) -{ - struct syscall_trace_enter trace; - struct syscall_metadata *meta = call->data; - int ret; - int i; - int offset = offsetof(typeof(trace), args); - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); - if (ret) - return ret; - - for (i = 0; i < meta->nb_args; i++) { - ret = trace_define_field(call, meta->types[i], - meta->args[i], offset, - sizeof(unsigned long), 0, - FILTER_OTHER); - offset += sizeof(unsigned long); - } - - return ret; -} - -static int syscall_exit_define_fields(struct ftrace_event_call *call) -{ - struct syscall_trace_exit trace; - int ret; - - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); - if (ret) - return ret; - - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), - FILTER_OTHER); - - return ret; -} - -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) -{ 
- struct syscall_trace_enter *entry; - struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - int size; - int syscall_nr; - - syscall_nr = syscall_get_nr(current, regs); - if (syscall_nr < 0) - return; - if (!test_bit(syscall_nr, enabled_enter_syscalls)) - return; - - sys_data = syscall_nr_to_meta(syscall_nr); - if (!sys_data) - return; - - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - - event = trace_current_buffer_lock_reserve(&buffer, - sys_data->enter_event->event.type, size, 0, 0); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - - if (!filter_current_check_discard(buffer, sys_data->enter_event, - entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); -} - -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) -{ - struct syscall_trace_exit *entry; - struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - int syscall_nr; - - syscall_nr = syscall_get_nr(current, regs); - if (syscall_nr < 0) - return; - if (!test_bit(syscall_nr, enabled_exit_syscalls)) - return; - - sys_data = syscall_nr_to_meta(syscall_nr); - if (!sys_data) - return; - - event = trace_current_buffer_lock_reserve(&buffer, - sys_data->exit_event->event.type, sizeof(*entry), 0, 0); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->nr = syscall_nr; - entry->ret = syscall_get_return_value(current, regs); - - if (!filter_current_check_discard(buffer, sys_data->exit_event, - entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); -} - -int reg_event_syscall_enter(struct ftrace_event_call *call) -{ - int ret = 0; - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) - return -ENOSYS; - mutex_lock(&syscall_trace_lock); - 
if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); - if (!ret) { - set_bit(num, enabled_enter_syscalls); - sys_refcount_enter++; - } - mutex_unlock(&syscall_trace_lock); - return ret; -} - -void unreg_event_syscall_enter(struct ftrace_event_call *call) -{ - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) - return; - mutex_lock(&syscall_trace_lock); - sys_refcount_enter--; - clear_bit(num, enabled_enter_syscalls); - if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter, NULL); - mutex_unlock(&syscall_trace_lock); -} - -int reg_event_syscall_exit(struct ftrace_event_call *call) -{ - int ret = 0; - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) - return -ENOSYS; - mutex_lock(&syscall_trace_lock); - if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); - if (!ret) { - set_bit(num, enabled_exit_syscalls); - sys_refcount_exit++; - } - mutex_unlock(&syscall_trace_lock); - return ret; -} - -void unreg_event_syscall_exit(struct ftrace_event_call *call) -{ - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) - return; - mutex_lock(&syscall_trace_lock); - sys_refcount_exit--; - clear_bit(num, enabled_exit_syscalls); - if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit, NULL); - mutex_unlock(&syscall_trace_lock); -} - -int init_syscall_trace(struct ftrace_event_call *call) -{ - int id; - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) { - pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", - ((struct syscall_metadata *)call->data)->name); - return -ENOSYS; - } - - if (set_syscall_print_fmt(call) < 0) - return -ENOMEM; - - id = trace_event_raw_init(call); - - if (id < 0) { - 
free_syscall_print_fmt(call); - return id; - } - - return id; -} - -unsigned long __init __weak arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} - -int __init init_ftrace_syscalls(void) -{ - struct syscall_metadata *meta; - unsigned long addr; - int i; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - addr = arch_syscall_addr(i); - meta = find_syscall_meta(addr); - if (!meta) - continue; - - meta->syscall_nr = i; - syscalls_metadata[i] = meta; - } - - return 0; -} -core_initcall(init_ftrace_syscalls); - -#ifdef CONFIG_PERF_EVENTS - -static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); -static int sys_perf_refcount_enter; -static int sys_perf_refcount_exit; - -static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) -{ - struct syscall_metadata *sys_data; - struct syscall_trace_enter *rec; - struct hlist_head *head; - int syscall_nr; - int rctx; - int size; - - syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) - return; - - sys_data = syscall_nr_to_meta(syscall_nr); - if (!sys_data) - return; - - /* get the size after alignment with the u32 buffer size field */ - size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); - size = ALIGN(size + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); - - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) - return; - - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); - if (!rec) - return; - - rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); - - head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, 
regs, head); -} - -int perf_sysenter_enable(struct ftrace_event_call *call) -{ - int ret = 0; - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_enter) - ret = register_trace_sys_enter(perf_syscall_enter, NULL); - if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); - } else { - set_bit(num, enabled_perf_enter_syscalls); - sys_perf_refcount_enter++; - } - mutex_unlock(&syscall_trace_lock); - return ret; -} - -void perf_sysenter_disable(struct ftrace_event_call *call) -{ - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - - mutex_lock(&syscall_trace_lock); - sys_perf_refcount_enter--; - clear_bit(num, enabled_perf_enter_syscalls); - if (!sys_perf_refcount_enter) - unregister_trace_sys_enter(perf_syscall_enter, NULL); - mutex_unlock(&syscall_trace_lock); -} - -static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) -{ - struct syscall_metadata *sys_data; - struct syscall_trace_exit *rec; - struct hlist_head *head; - int syscall_nr; - int rctx; - int size; - - syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) - return; - - sys_data = syscall_nr_to_meta(syscall_nr); - if (!sys_data) - return; - - /* We can probably do that at build time */ - size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); - - /* - * Impossible, but be paranoid with the future - * How to put this check outside runtime? 
- */ - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "exit event has grown above perf buffer size")) - return; - - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); - if (!rec) - return; - - rec->nr = syscall_nr; - rec->ret = syscall_get_return_value(current, regs); - - head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); -} - -int perf_sysexit_enable(struct ftrace_event_call *call) -{ - int ret = 0; - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - - mutex_lock(&syscall_trace_lock); - if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit, NULL); - if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); - } else { - set_bit(num, enabled_perf_exit_syscalls); - sys_perf_refcount_exit++; - } - mutex_unlock(&syscall_trace_lock); - return ret; -} - -void perf_sysexit_disable(struct ftrace_event_call *call) -{ - int num; - - num = ((struct syscall_metadata *)call->data)->syscall_nr; - - mutex_lock(&syscall_trace_lock); - sys_perf_refcount_exit--; - clear_bit(num, enabled_perf_exit_syscalls); - if (!sys_perf_refcount_exit) - unregister_trace_sys_exit(perf_syscall_exit, NULL); - mutex_unlock(&syscall_trace_lock); -} - -#endif /* CONFIG_PERF_EVENTS */ - -static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) -{ - switch (type) { - case TRACE_REG_REGISTER: - return reg_event_syscall_enter(event); - case TRACE_REG_UNREGISTER: - unreg_event_syscall_enter(event); - return 0; - -#ifdef CONFIG_PERF_EVENTS - case TRACE_REG_PERF_REGISTER: - return perf_sysenter_enable(event); - case TRACE_REG_PERF_UNREGISTER: - perf_sysenter_disable(event); - return 0; -#endif - } - return 0; -} - -static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) -{ - switch (type) { - case TRACE_REG_REGISTER: - return 
reg_event_syscall_exit(event); - case TRACE_REG_UNREGISTER: - unreg_event_syscall_exit(event); - return 0; - -#ifdef CONFIG_PERF_EVENTS - case TRACE_REG_PERF_REGISTER: - return perf_sysexit_enable(event); - case TRACE_REG_PERF_UNREGISTER: - perf_sysexit_disable(event); - return 0; -#endif - } - return 0; -} -/* - * Workqueue statistical tracer. - * - * Copyright (C) 2008 Frederic Weisbecker - * - */ - - -#include -#include -#include -#include -#include -#include "trace_stat.h" -#include "trace.h" - - -/* A cpu workqueue thread */ -struct cpu_workqueue_stats { - struct list_head list; - struct kref kref; - int cpu; - pid_t pid; -/* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; -/* - * Don't need to be atomic, works are serialized in a single workqueue thread - * on a single CPU. - */ - unsigned int executed; -}; - -/* List of workqueue threads on one cpu */ -struct workqueue_global_stats { - struct list_head list; - spinlock_t lock; -}; - -/* Don't need a global lock because allocated before the workqueues, and - * never freed. 
- */ -static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); -#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) - -static void cpu_workqueue_stat_free(struct kref *kref) -{ - kfree(container_of(kref, struct cpu_workqueue_stats, kref)); -} - -/* Insertion of a work */ -static void -probe_workqueue_insertion(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - atomic_inc(&node->inserted); - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Execution of a work */ -static void -probe_workqueue_execution(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - node->executed++; - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(void *ignore, - struct task_struct *wq_thread, int cpu) -{ - struct cpu_workqueue_stats *cws; - unsigned long flags; - - WARN_ON(cpu < 0); - - /* Workqueues are sometimes created in atomic context */ - cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); - if (!cws) { - pr_warning("trace_workqueue: not enough memory\n"); - return; - } - INIT_LIST_HEAD(&cws->list); - kref_init(&cws->kref); - cws->cpu = cpu; - cws->pid = 
wq_thread->pid; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Destruction of a cpu workqueue thread */ -static void -probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) -{ - /* Workqueue only execute on one cpu */ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { - if (node->pid == wq_thread->pid) { - list_del(&node->list); - kref_put(&node->kref, cpu_workqueue_stat_free); - goto found; - } - } - - pr_debug("trace_workqueue: don't find workqueue to destroy\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -} - -static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) -{ - unsigned long flags; - struct cpu_workqueue_stats *ret = NULL; - - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { - ret = list_entry(workqueue_cpu_stat(cpu)->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static void *workqueue_stat_start(struct tracer_stat *trace) -{ - int cpu; - void *ret = NULL; - - for_each_possible_cpu(cpu) { - ret = workqueue_stat_start_cpu(cpu); - if (ret) - return ret; - } - return NULL; -} - -static void *workqueue_stat_next(void *prev, int idx) -{ - struct cpu_workqueue_stats *prev_cws = prev; - struct cpu_workqueue_stats *ret; - int cpu = prev_cws->cpu; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 
- do { - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - return NULL; - } while (!(ret = workqueue_stat_start_cpu(cpu))); - return ret; - } else { - ret = list_entry(prev_cws->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static int workqueue_stat_show(struct seq_file *s, void *p) -{ - struct cpu_workqueue_stats *cws = p; - struct pid *pid; - struct task_struct *tsk; - - pid = find_get_pid(cws->pid); - if (pid) { - tsk = get_pid_task(pid, PIDTYPE_PID); - if (tsk) { - seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, - atomic_read(&cws->inserted), cws->executed, - tsk->comm); - put_task_struct(tsk); - } - put_pid(pid); - } - - return 0; -} - -static void workqueue_stat_release(void *stat) -{ - struct cpu_workqueue_stats *node = stat; - - kref_put(&node->kref, cpu_workqueue_stat_free); -} - -static int workqueue_stat_headers(struct seq_file *s) -{ - seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n"); - return 0; -} - -struct tracer_stat workqueue_stats __read_mostly = { - .name = "workqueues", - .stat_start = workqueue_stat_start, - .stat_next = workqueue_stat_next, - .stat_show = workqueue_stat_show, - .stat_release = workqueue_stat_release, - .stat_headers = workqueue_stat_headers -}; - - -int __init stat_workqueue_init(void) -{ - if (register_stat_tracer(&workqueue_stats)) { - pr_warning("Unable to register workqueue stat tracer\n"); - return 1; - } - - return 0; -} -fs_initcall(stat_workqueue_init); - -/* - * Workqueues are created very early, just after pre-smp initcalls. - * So we must register our tracepoints at this stage. 
- */ -int __init trace_workqueue_early_init(void) -{ - int ret, cpu; - - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - - ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); - if (ret) - goto out; - - ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); - if (ret) - goto no_insertion; - - ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); - if (ret) - goto no_execution; - - ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); - if (ret) - goto no_creation; - - return 0; - -no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); -no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); -no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -out: - pr_warning("trace_workqueue: unable to trace workqueues\n"); - - return 1; -} -early_initcall(trace_workqueue_early_init); -/* - * Copyright (C) 2008 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern struct tracepoint * const __start___tracepoints_ptrs[]; -extern struct tracepoint * const __stop___tracepoints_ptrs[]; - -/* Set to 1 to enable tracepoint debug output */ -static const int tracepoint_debug; - -/* - * Tracepoints mutex protects the builtin and module tracepoints and the hash - * table, as well as the local module list. - */ -static DEFINE_MUTEX(tracepoints_mutex); - -#ifdef CONFIG_MODULES -/* Local list of struct module */ -static LIST_HEAD(tracepoint_module_list); -#endif /* CONFIG_MODULES */ - -/* - * Tracepoint hash table, containing the active tracepoints. - * Protected by tracepoints_mutex. - */ -#define TRACEPOINT_HASH_BITS 6 -#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; - -/* - * Note about RCU : - * It is used to delay the free of multiple probes array until a quiescent - * state is reached. - * Tracepoint entries modifications are protected by the tracepoints_mutex. - */ -struct tracepoint_entry { - struct hlist_node hlist; - struct tracepoint_func *funcs; - int refcount; /* Number of times armed. 0 if disarmed. */ - char name[0]; -}; - -struct tp_probes { - union { - struct rcu_head rcu; - struct list_head list; - } u; - struct tracepoint_func probes[0]; -}; - -static inline void *allocate_probes(int count) -{ - struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func) - + sizeof(struct tp_probes), GFP_KERNEL); - return p == NULL ? 
NULL : p->probes; -} - -static void rcu_free_old_probes(struct rcu_head *head) -{ - kfree(container_of(head, struct tp_probes, u.rcu)); -} - -static inline void release_probes(struct tracepoint_func *old) -{ - if (old) { - struct tp_probes *tp_probes = container_of(old, - struct tp_probes, probes[0]); - call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); - } -} - -static void debug_print_probes(struct tracepoint_entry *entry) -{ - int i; - - if (!tracepoint_debug || !entry->funcs) - return; - - for (i = 0; entry->funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); -} - -static struct tracepoint_func * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, - void *probe, void *data) -{ - int nr_probes = 0; - struct tracepoint_func *old, *new; - - WARN_ON(!probe); - - debug_print_probes(entry); - old = entry->funcs; - if (old) { - /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe && - old[nr_probes].data == data) - return ERR_PTR(-EEXIST); - } - /* + 2 : one for new probe, one for NULL func */ - new = allocate_probes(nr_probes + 2); - if (new == NULL) - return ERR_PTR(-ENOMEM); - if (old) - memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes].func = probe; - new[nr_probes].data = data; - new[nr_probes + 1].func = NULL; - entry->refcount = nr_probes + 1; - entry->funcs = new; - debug_print_probes(entry); - return old; -} - -static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, - void *probe, void *data) -{ - int nr_probes = 0, nr_del = 0, i; - struct tracepoint_func *old, *new; - - old = entry->funcs; - - if (!old) - return ERR_PTR(-ENOENT); - - debug_print_probes(entry); - /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (!probe || - (old[nr_probes].func == probe && - old[nr_probes].data == data)) - nr_del++; - } - - if (nr_probes - nr_del == 0) { - 
/* N -> 0, (N > 1) */ - entry->funcs = NULL; - entry->refcount = 0; - debug_print_probes(entry); - return old; - } else { - int j = 0; - /* N -> M, (N > 1, M > 0) */ - /* + 1 for NULL */ - new = allocate_probes(nr_probes - nr_del + 1); - if (new == NULL) - return ERR_PTR(-ENOMEM); - for (i = 0; old[i].func; i++) - if (probe && - (old[i].func != probe || old[i].data != data)) - new[j++] = old[i]; - new[nr_probes - nr_del].func = NULL; - entry->refcount = nr_probes - nr_del; - entry->funcs = new; - } - debug_print_probes(entry); - return old; -} - -/* - * Get tracepoint if the tracepoint is present in the tracepoint hash table. - * Must be called with tracepoints_mutex held. - * Returns NULL if not present. - */ -static struct tracepoint_entry *get_tracepoint(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} - -/* - * Add the tracepoint to the tracepoint hash table. Must be called with - * tracepoints_mutex held. - */ -static struct tracepoint_entry *add_tracepoint(const char *name) -{ - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "tracepoint %s busy\n", name); - return ERR_PTR(-EEXIST); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. 
- */ - e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - e->funcs = NULL; - e->refcount = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the tracepoint from the tracepoint hash table. Must be called with - * mutex_lock held. - */ -static inline void remove_tracepoint(struct tracepoint_entry *e) -{ - hlist_del(&e->hlist); - kfree(e); -} - -/* - * Sets the probe callback corresponding to one tracepoint. - */ -static void set_tracepoint(struct tracepoint_entry **entry, - struct tracepoint *elem, int active) -{ - WARN_ON(strcmp((*entry)->name, elem->name) != 0); - - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) - elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) - elem->unregfunc(); - - /* - * rcu_assign_pointer has a smp_wmb() which makes sure that the new - * probe callbacks array is consistent before setting a pointer to it. - * This array is referenced by __DO_TRACE from - * include/linux/tracepoints.h. A matching smp_read_barrier_depends() - * is used. - */ - rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); -} - -/* - * Disable a tracepoint and its probe callback. - * Note: only waiting an RCU period after setting elem->call to the empty - * function insures that the original callback is not used anymore. This insured - * by preempt_disable around the call site. 
- */ -static void disable_tracepoint(struct tracepoint *elem) -{ - if (elem->unregfunc && jump_label_enabled(&elem->key)) - elem->unregfunc(); - - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); - rcu_assign_pointer(elem->funcs, NULL); -} - -/** - * tracepoint_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of tracepoints. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) -{ - struct tracepoint * const *iter; - struct tracepoint_entry *mark_entry; - - if (!begin) - return; - - for (iter = begin; iter < end; iter++) { - mark_entry = get_tracepoint((*iter)->name); - if (mark_entry) { - set_tracepoint(&mark_entry, *iter, - !!mark_entry->refcount); - } else { - disable_tracepoint(*iter); - } - } -} - -#ifdef CONFIG_MODULES -void module_update_tracepoints(void) -{ - struct tp_module *tp_mod; - - list_for_each_entry(tp_mod, &tracepoint_module_list, list) - tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, - tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); -} -#else /* CONFIG_MODULES */ -void module_update_tracepoints(void) -{ -} -#endif /* CONFIG_MODULES */ - - -/* - * Update probes, removing the faulty probes. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probes(void) -{ - /* Core kernel tracepoints */ - tracepoint_update_probe_range(__start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - /* tracepoints in modules. 
*/ - module_update_tracepoints(); -} - -static struct tracepoint_func * -tracepoint_add_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) - return (struct tracepoint_func *)entry; - } - old = tracepoint_entry_add_probe(entry, probe, data); - if (IS_ERR(old) && !entry->refcount) - remove_tracepoint(entry); - return old; -} - -/** - * tracepoint_probe_register - Connect a probe to a tracepoint - * @name: tracepoint name - * @probe: probe handler - * - * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. - */ -int tracepoint_probe_register(const char *name, void *probe, void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ - mutex_unlock(&tracepoints_mutex); - release_probes(old); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_register); - -static struct tracepoint_func * -tracepoint_remove_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) - return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe, data); - if (IS_ERR(old)) - return old; - if (!entry->refcount) - remove_tracepoint(entry); - return old; -} - -/** - * tracepoint_probe_unregister - Disconnect a probe from a tracepoint - * @name: tracepoint name - * @probe: probe function pointer - * - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have 
finished. - */ -int tracepoint_probe_unregister(const char *name, void *probe, void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ - mutex_unlock(&tracepoints_mutex); - release_probes(old); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); - -static LIST_HEAD(old_probes); -static int need_update; - -static void tracepoint_add_old_probes(void *old) -{ - need_update = 1; - if (old) { - struct tp_probes *tp_probes = container_of(old, - struct tp_probes, probes[0]); - list_add(&tp_probes->u.list, &old_probes); - } -} - -/** - * tracepoint_probe_register_noupdate - register a probe but not connect - * @name: tracepoint name - * @probe: probe handler - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_register_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); - -/** - * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect - * @name: tracepoint name - * @probe: probe function pointer - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} 
-EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); - -/** - * tracepoint_probe_update_all - update tracepoints - */ -void tracepoint_probe_update_all(void) -{ - LIST_HEAD(release_probes); - struct tp_probes *pos, *next; - - mutex_lock(&tracepoints_mutex); - if (!need_update) { - mutex_unlock(&tracepoints_mutex); - return; - } - if (!list_empty(&old_probes)) - list_replace_init(&old_probes, &release_probes); - need_update = 0; - tracepoint_update_probes(); - mutex_unlock(&tracepoints_mutex); - list_for_each_entry_safe(pos, next, &release_probes, u.list) { - list_del(&pos->u.list); - call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); - } -} -EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); - -/** - * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. - * @tracepoint: current tracepoints (in), next tracepoint (out) - * @begin: beginning of the range - * @end: end of the range - * - * Returns whether a next tracepoint has been found (1) or not (0). - * Will return the first tracepoint in the range if the input tracepoint is - * NULL. 
- */ -static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end) -{ - if (!*tracepoint && begin != end) { - *tracepoint = begin; - return 1; - } - if (*tracepoint >= begin && *tracepoint < end) - return 1; - return 0; -} - -#ifdef CONFIG_MODULES -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - struct tp_module *iter_mod; - - /* Core kernel tracepoints */ - if (!iter->module) { - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (found) - goto end; - } - /* Tracepoints in modules */ - mutex_lock(&tracepoints_mutex); - list_for_each_entry(iter_mod, &tracepoint_module_list, list) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - mutex_unlock(&tracepoints_mutex); -end: - if (!found) - tracepoint_iter_reset(iter); -} -#else /* CONFIG_MODULES */ -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - - /* Core kernel tracepoints */ - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (!found) - tracepoint_iter_reset(iter); -} -#endif /* CONFIG_MODULES */ - -void tracepoint_iter_start(struct tracepoint_iter *iter) -{ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_start); - -void tracepoint_iter_next(struct tracepoint_iter *iter) -{ - iter->tracepoint++; - /* - * iter->tracepoint may be invalid because we blindly incremented it. - * Make sure it is valid by marshalling on the tracepoints, getting the - * tracepoints from following modules if necessary. 
- */ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_next); - -void tracepoint_iter_stop(struct tracepoint_iter *iter) -{ -} -EXPORT_SYMBOL_GPL(tracepoint_iter_stop); - -void tracepoint_iter_reset(struct tracepoint_iter *iter) -{ -#ifdef CONFIG_MODULES - iter->module = NULL; -#endif /* CONFIG_MODULES */ - iter->tracepoint = NULL; -} -EXPORT_SYMBOL_GPL(tracepoint_iter_reset); - -#ifdef CONFIG_MODULES -static int tracepoint_module_coming(struct module *mod) -{ - struct tp_module *tp_mod, *iter; - int ret = 0; - - /* - * We skip modules that taint the kernel, especially those with different - * module headers (for forced load), to make sure we don't cause a crash. - * Staging and out-of-tree GPL modules are fine. - */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) - return 0; - mutex_lock(&tracepoints_mutex); - tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); - if (!tp_mod) { - ret = -ENOMEM; - goto end; - } - tp_mod->num_tracepoints = mod->num_tracepoints; - tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; - - /* - * tracepoint_module_list is kept sorted by struct module pointer - * address for iteration on tracepoints from a seq_file that can release - * the mutex between calls. - */ - list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { - BUG_ON(iter == tp_mod); /* Should never be in the list twice */ - if (iter < tp_mod) { - /* We belong to the location right after iter. 
*/ - list_add(&tp_mod->list, &iter->list); - goto module_added; - } - } - /* We belong to the beginning of the list */ - list_add(&tp_mod->list, &tracepoint_module_list); -module_added: - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); -end: - mutex_unlock(&tracepoints_mutex); - return ret; -} - -static int tracepoint_module_going(struct module *mod) -{ - struct tp_module *pos; - - mutex_lock(&tracepoints_mutex); - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - list_for_each_entry(pos, &tracepoint_module_list, list) { - if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { - list_del(&pos->list); - kfree(pos); - break; - } - } - /* - * In the case of modules that were tainted at "coming", we'll simply - * walk through the list without finding it. We cannot use the "tainted" - * flag on "going", in case a module taints the kernel only after being - * loaded. - */ - mutex_unlock(&tracepoints_mutex); - return 0; -} - -int tracepoint_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - int ret = 0; - - switch (val) { - case MODULE_STATE_COMING: - ret = tracepoint_module_coming(mod); - break; - case MODULE_STATE_LIVE: - break; - case MODULE_STATE_GOING: - ret = tracepoint_module_going(mod); - break; - } - return ret; -} - -struct notifier_block tracepoint_module_nb = { - .notifier_call = tracepoint_module_notify, - .priority = 0, -}; - -static int init_tracepoints(void) -{ - return register_module_notifier(&tracepoint_module_nb); -} -__initcall(init_tracepoints); -#endif /* CONFIG_MODULES */ - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - -/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ -static int sys_tracepoint_refcount; - -void syscall_regfunc(void) -{ - unsigned long flags; - struct task_struct *g, *t; - - if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); 
- do_each_thread(g, t) { - /* Skip kernel threads. */ - if (t->mm) - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); - } - sys_tracepoint_refcount++; -} - -void syscall_unregfunc(void) -{ - unsigned long flags; - struct task_struct *g, *t; - - sys_tracepoint_refcount--; - if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); - } -} -#endif -/* - * tsacct.c - System accounting over taskstats interface - * - * Copyright (C) Jay Lan, - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#include -#include -#include -#include -#include -#include - -/* - * fill in basic accounting fields - */ -void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) -{ - const struct cred *tcred; - struct timespec uptime, ts; - u64 ac_etime; - - BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); - - /* calculate task elapsed time in timespec */ - do_posix_clock_monotonic_gettime(&uptime); - ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec (should never be negative) */ - ac_etime = timespec_to_ns(&ts); - do_div(ac_etime, NSEC_PER_USEC); - stats->ac_etime = ac_etime; - stats->ac_btime = get_seconds() - ts.tv_sec; - if (thread_group_leader(tsk)) { - stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } - if (tsk->flags & PF_SUPERPRIV) - stats->ac_flag |= ASU; - if (tsk->flags & PF_DUMPCORE) - stats->ac_flag |= ACORE; - if (tsk->flags & PF_SIGNALED) - stats->ac_flag |= AXSIG; - stats->ac_nice = task_nice(tsk); - stats->ac_sched = tsk->policy; - stats->ac_pid = tsk->pid; - rcu_read_lock(); - tcred = __task_cred(tsk); - stats->ac_uid = tcred->uid; - stats->ac_gid = tcred->gid; - stats->ac_ppid = pid_alive(tsk) ? 
- rcu_dereference(tsk->real_parent)->tgid : 0; - rcu_read_unlock(); - stats->ac_utime = cputime_to_usecs(tsk->utime); - stats->ac_stime = cputime_to_usecs(tsk->stime); - stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); - stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); - stats->ac_minflt = tsk->min_flt; - stats->ac_majflt = tsk->maj_flt; - - strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); -} - - -#ifdef CONFIG_TASK_XACCT - -#define KB 1024 -#define MB (1024*KB) -#define KB_MASK (~(KB-1)) -/* - * fill in extended accounting fields - */ -void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) -{ - struct mm_struct *mm; - - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; - mm = get_task_mm(p); - if (mm) { - /* adjust to KB unit */ - stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; - stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; - mmput(mm); - } - stats->read_char = p->ioac.rchar & KB_MASK; - stats->write_char = p->ioac.wchar & KB_MASK; - stats->read_syscalls = p->ioac.syscr & KB_MASK; - stats->write_syscalls = p->ioac.syscw & KB_MASK; -#ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes & KB_MASK; - stats->write_bytes = p->ioac.write_bytes & KB_MASK; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; -#else - stats->read_bytes = 0; - stats->write_bytes = 0; - stats->cancelled_write_bytes = 0; -#endif -} -#undef KB -#undef MB - -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) -{ - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = tsk->stime + tsk->utime; - dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - 
delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } -} - -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - tsk->acct_timexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; -} -#endif -/* - * Wrapper functions for 16bit uid back compatibility. All nicely tied - * together in the faint hope we can take the out in five years time. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) -{ - long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; -} - -SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) -{ - long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; -} - -SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) -{ - long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, fd, user, group); - return ret; -} - -SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) -{ - long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, rgid, egid); - return ret; -} - -SYSCALL_DEFINE1(setgid16, old_gid_t, gid) -{ - long ret = sys_setgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: 
*/ - asmlinkage_protect(1, ret, gid); - return ret; -} - -SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) -{ - long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, ruid, euid); - return ret; -} - -SYSCALL_DEFINE1(setuid16, old_uid_t, uid) -{ - long ret = sys_setuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; -} - -SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) -{ - long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), - low2highuid(suid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, ruid, euid, suid); - return ret; -} - -SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) -{ - const struct cred *cred = current_cred(); - int retval; - - if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && - !(retval = put_user(high2lowuid(cred->euid), euid))) - retval = put_user(high2lowuid(cred->suid), suid); - - return retval; -} - -SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) -{ - long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), - low2highgid(sgid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, rgid, egid, sgid); - return ret; -} - - -SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) -{ - const struct cred *cred = current_cred(); - int retval; - - if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && - !(retval = put_user(high2lowgid(cred->egid), egid))) - retval = put_user(high2lowgid(cred->sgid), sgid); - - return retval; -} - -SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) -{ - long ret = sys_setfsuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; -} - -SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) -{ - long ret = 
sys_setfsgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; -} - -static int groups16_to_user(old_gid_t __user *grouplist, - struct group_info *group_info) -{ - int i; - old_gid_t group; - - for (i = 0; i < group_info->ngroups; i++) { - group = high2lowgid(GROUP_AT(group_info, i)); - if (put_user(group, grouplist+i)) - return -EFAULT; - } - - return 0; -} - -static int groups16_from_user(struct group_info *group_info, - old_gid_t __user *grouplist) -{ - int i; - old_gid_t group; - - for (i = 0; i < group_info->ngroups; i++) { - if (get_user(group, grouplist+i)) - return -EFAULT; - GROUP_AT(group_info, i) = low2highgid(group); - } - - return 0; -} - -SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist) -{ - const struct cred *cred = current_cred(); - int i; - - if (gidsetsize < 0) - return -EINVAL; - - i = cred->group_info->ngroups; - if (gidsetsize) { - if (i > gidsetsize) { - i = -EINVAL; - goto out; - } - if (groups16_to_user(grouplist, cred->group_info)) { - i = -EFAULT; - goto out; - } - } -out: - return i; -} - -SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) -{ - struct group_info *group_info; - int retval; - - if (!nsown_capable(CAP_SETGID)) - return -EPERM; - if ((unsigned)gidsetsize > NGROUPS_MAX) - return -EINVAL; - - group_info = groups_alloc(gidsetsize); - if (!group_info) - return -ENOMEM; - retval = groups16_from_user(group_info, grouplist); - if (retval) { - put_group_info(group_info); - return retval; - } - - retval = set_current_groups(group_info); - put_group_info(group_info); - - return retval; -} - -SYSCALL_DEFINE0(getuid16) -{ - return high2lowuid(current_uid()); -} - -SYSCALL_DEFINE0(geteuid16) -{ - return high2lowuid(current_euid()); -} - -SYSCALL_DEFINE0(getgid16) -{ - return high2lowgid(current_gid()); -} - -SYSCALL_DEFINE0(getegid16) -{ - return high2lowgid(current_egid()); -} -/* - * Uniprocessor-only support functions. 
The counterpart to kernel/smp.c - */ - -#include -#include -#include -#include - -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int wait) -{ - WARN_ON(cpu != 0); - - local_irq_disable(); - (func)(info); - local_irq_enable(); - - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - -#include -#include -#include -#include - -static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); - -/* - * Request a notification when the current cpu returns to userspace. Must be - * called in atomic context. The notifier will also be called in atomic - * context. - */ -void user_return_notifier_register(struct user_return_notifier *urn) -{ - set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); - hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); -} -EXPORT_SYMBOL_GPL(user_return_notifier_register); - -/* - * Removes a registered user return notifier. Must be called from atomic - * context, and from the same cpu registration occurred in. - */ -void user_return_notifier_unregister(struct user_return_notifier *urn) -{ - hlist_del(&urn->link); - if (hlist_empty(&__get_cpu_var(return_notifier_list))) - clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); -} -EXPORT_SYMBOL_GPL(user_return_notifier_unregister); - -/* Calls registered user return notifiers */ -void fire_user_return_notifiers(void) -{ - struct user_return_notifier *urn; - struct hlist_node *tmp1, *tmp2; - struct hlist_head *head; - - head = &get_cpu_var(return_notifier_list); - hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) - urn->on_user_return(urn); - put_cpu_var(return_notifier_list); -} -/* - * The "user cache". - * - * (C) Copyright 1991-2000 Linus Torvalds - * - * We have a per-user structure to keep track of how many - * processes, files etc the user has claimed, in order to be - * able to have per-user limits for system resources. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * userns count is 1 for root user, 1 for init_uts_ns, - * and 1 for... ? - */ -struct user_namespace init_user_ns = { - .kref = { - .refcount = ATOMIC_INIT(3), - }, - .creator = &root_user, -}; -EXPORT_SYMBOL_GPL(init_user_ns); - -/* - * UID task count cache, to get fast user lookup in "alloc_uid" - * when changing user ID's (ie setuid() and friends). - */ - -#define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) - -static struct kmem_cache *uid_cachep; - -/* - * The uidhash_lock is mostly taken from process context, but it is - * occasionally also taken from softirq/tasklet context, when - * task-structs get RCU-freed. Hence all locking must be softirq-safe. - * But free_uid() is also called with local interrupts disabled, and running - * local_bh_enable() with local interrupts disabled is an error - we'll run - * softirq callbacks, and they can unconditionally enable interrupts, and - * the caller of free_uid() didn't expect that.. - */ -static DEFINE_SPINLOCK(uidhash_lock); - -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ -struct user_struct root_user = { - .__count = ATOMIC_INIT(2), - .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), - .sigpending = ATOMIC_INIT(0), - .locked_shm = 0, - .user_ns = &init_user_ns, -}; - -/* - * These routines must be called with the uidhash spinlock held! 
- */ -static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) -{ - hlist_add_head(&up->uidhash_node, hashent); -} - -static void uid_hash_remove(struct user_struct *up) -{ - hlist_del_init(&up->uidhash_node); - put_user_ns(up->user_ns); -} - -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) -{ - struct user_struct *user; - struct hlist_node *h; - - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { - atomic_inc(&user->__count); - return user; - } - } - - return NULL; -} - -/* IRQs are disabled and uidhash_lock is held upon function entry. - * IRQ state (as stored in flags) is restored and uidhash_lock released - * upon function exit. - */ -static void free_user(struct user_struct *up, unsigned long flags) - __releases(&uidhash_lock) -{ - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); -} - -/* - * Locate the user_struct for the passed UID. If found, take a ref on it. The - * caller must undo that ref with free_uid(). - * - * If the user_struct could not be found, return NULL. 
- */ -struct user_struct *find_user(uid_t uid) -{ - struct user_struct *ret; - unsigned long flags; - struct user_namespace *ns = current_user_ns(); - - spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(ns, uid)); - spin_unlock_irqrestore(&uidhash_lock, flags); - return ret; -} - -void free_uid(struct user_struct *up) -{ - unsigned long flags; - - if (!up) - return; - - local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) - free_user(up, flags); - else - local_irq_restore(flags); -} - -struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) -{ - struct hlist_head *hashent = uidhashentry(ns, uid); - struct user_struct *up, *new; - - spin_lock_irq(&uidhash_lock); - up = uid_hash_find(uid, hashent); - spin_unlock_irq(&uidhash_lock); - - if (!up) { - new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); - if (!new) - goto out_unlock; - - new->uid = uid; - atomic_set(&new->__count, 1); - - new->user_ns = get_user_ns(ns); - - /* - * Before adding this, check whether we raced - * on adding the same user already.. 
- */ - spin_lock_irq(&uidhash_lock); - up = uid_hash_find(uid, hashent); - if (up) { - put_user_ns(ns); - key_put(new->uid_keyring); - key_put(new->session_keyring); - kmem_cache_free(uid_cachep, new); - } else { - uid_hash_insert(new, hashent); - up = new; - } - spin_unlock_irq(&uidhash_lock); - } - - return up; - -out_unlock: - return NULL; -} - -static int __init uid_cache_init(void) -{ - int n; - - uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - - for(n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); - - /* Insert the root user immediately (init already runs as root) */ - spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); - spin_unlock_irq(&uidhash_lock); - - return 0; -} - -module_init(uid_cache_init); -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ - -#include -#include -#include -#include -#include -#include - -static struct kmem_cache *user_ns_cachep __read_mostly; - -/* - * Create a new user namespace, deriving the creator from the user in the - * passed credentials, and replacing that user with the new root user for the - * new namespace. - * - * This is called by copy_creds(), which will finish setting the target task's - * credentials. - */ -int create_user_ns(struct cred *new) -{ - struct user_namespace *ns; - struct user_struct *root_user; - int n; - - ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); - if (!ns) - return -ENOMEM; - - kref_init(&ns->kref); - - for (n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(ns->uidhash_table + n); - - /* Alloc new root user. 
*/ - root_user = alloc_uid(ns, 0); - if (!root_user) { - kmem_cache_free(user_ns_cachep, ns); - return -ENOMEM; - } - - /* set the new root user in the credentials under preparation */ - ns->creator = new->user; - new->user = root_user; - new->uid = new->euid = new->suid = new->fsuid = 0; - new->gid = new->egid = new->sgid = new->fsgid = 0; - put_group_info(new->group_info); - new->group_info = get_group_info(&init_groups); -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* root_user holds a reference to ns, our reference can be dropped */ - put_user_ns(ns); - - return 0; -} - -/* - * Deferred destructor for a user namespace. This is required because - * free_user_ns() may be called with uidhash_lock held, but we need to call - * back to free_uid() which will want to take the lock again. - */ -static void free_user_ns_work(struct work_struct *work) -{ - struct user_namespace *ns = - container_of(work, struct user_namespace, destroyer); - free_uid(ns->creator); - kmem_cache_free(user_ns_cachep, ns); -} - -void free_user_ns(struct kref *kref) -{ - struct user_namespace *ns = - container_of(kref, struct user_namespace, kref); - - INIT_WORK(&ns->destroyer, free_user_ns_work); - schedule_work(&ns->destroyer); -} -EXPORT_SYMBOL(free_user_ns); - -uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) -{ - struct user_namespace *tmp; - - if (likely(to == cred->user->user_ns)) - return uid; - - - /* Is cred->user the creator of the target user_ns - * or the creator of one of it's parents? 
- */ - for ( tmp = to; tmp != &init_user_ns; - tmp = tmp->creator->user_ns ) { - if (cred->user == tmp->creator) { - return (uid_t)0; - } - } - - /* No useful relationship so no mapping */ - return overflowuid; -} - -gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) -{ - struct user_namespace *tmp; - - if (likely(to == cred->user->user_ns)) - return gid; - - /* Is cred->user the creator of the target user_ns - * or the creator of one of it's parents? - */ - for ( tmp = to; tmp != &init_user_ns; - tmp = tmp->creator->user_ns ) { - if (cred->user == tmp->creator) { - return (gid_t)0; - } - } - - /* No useful relationship so no mapping */ - return overflowgid; -} - -static __init int user_namespaces_init(void) -{ - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); - return 0; -} -module_init(user_namespaces_init); -/* - * Copyright (C) 2004 IBM Corporation - * - * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); - if (uts_ns) - kref_init(&uts_ns->kref); - return uts_ns; -} - -/* - * Clone a new ns copying an original utsname, setting refcount to 1 - * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise - */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, - struct uts_namespace *old_ns) -{ - struct uts_namespace *ns; - - ns = create_uts_ns(); - if (!ns) - return ERR_PTR(-ENOMEM); - - down_read(&uts_sem); - memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); - up_read(&uts_sem); - return ns; -} - -/* - * Copy task tsk's utsname namespace, or clone it if flags - * specifies CLONE_NEWUTS. In latter case, changes to the - * utsname of this process won't be seen by parent, and vice - * versa. 
- */ -struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) -{ - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; - struct uts_namespace *new_ns; - - BUG_ON(!old_ns); - get_uts_ns(old_ns); - - if (!(flags & CLONE_NEWUTS)) - return old_ns; - - new_ns = clone_uts_ns(tsk, old_ns); - - put_uts_ns(old_ns); - return new_ns; -} - -void free_uts_ns(struct kref *kref) -{ - struct uts_namespace *ns; - - ns = container_of(kref, struct uts_namespace, kref); - put_user_ns(ns->user_ns); - kfree(ns); -} - -static void *utsns_get(struct task_struct *task) -{ - struct uts_namespace *ns = NULL; - struct nsproxy *nsproxy; - - rcu_read_lock(); - nsproxy = task_nsproxy(task); - if (nsproxy) { - ns = nsproxy->uts_ns; - get_uts_ns(ns); - } - rcu_read_unlock(); - - return ns; -} - -static void utsns_put(void *ns) -{ - put_uts_ns(ns); -} - -static int utsns_install(struct nsproxy *nsproxy, void *ns) -{ - get_uts_ns(ns); - put_uts_ns(nsproxy->uts_ns); - nsproxy->uts_ns = ns; - return 0; -} - -const struct proc_ns_operations utsns_operations = { - .name = "uts", - .type = CLONE_NEWUTS, - .get = utsns_get, - .put = utsns_put, - .install = utsns_install, -}; - -/* - * Copyright (C) 2007 - * - * Author: Eric Biederman - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. 
- */ - -#include -#include -#include -#include -#include - -static void *get_uts(ctl_table *table, int write) -{ - char *which = table->data; - struct uts_namespace *uts_ns; - - uts_ns = current->nsproxy->uts_ns; - which = (which - (char *)&init_uts_ns) + (char *)uts_ns; - - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); - return which; -} - -static void put_uts(ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - -#ifdef CONFIG_PROC_SYSCTL -/* - * Special case of dostring for the UTS structure. This has locks - * to observe. Should this be in kernel/sys.c ???? - */ -static int proc_do_uts_string(ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table uts_table; - int r; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); - put_uts(table, write, uts_table.data); - - if (write) - proc_sys_poll_notify(table->poll); - - return r; -} -#else -#define proc_do_uts_string NULL -#endif - -static DEFINE_CTL_TABLE_POLL(hostname_poll); -static DEFINE_CTL_TABLE_POLL(domainname_poll); - -static struct ctl_table uts_kern_table[] = { - { - .procname = "ostype", - .data = init_uts_ns.name.sysname, - .maxlen = sizeof(init_uts_ns.name.sysname), - .mode = 0444, - .proc_handler = proc_do_uts_string, - }, - { - .procname = "osrelease", - .data = init_uts_ns.name.release, - .maxlen = sizeof(init_uts_ns.name.release), - .mode = 0444, - .proc_handler = proc_do_uts_string, - }, - { - .procname = "version", - .data = init_uts_ns.name.version, - .maxlen = sizeof(init_uts_ns.name.version), - .mode = 0444, - .proc_handler = proc_do_uts_string, - }, - { - .procname = "hostname", - .data = init_uts_ns.name.nodename, - .maxlen = sizeof(init_uts_ns.name.nodename), - .mode = 0644, - .proc_handler = proc_do_uts_string, - .poll = &hostname_poll, - }, - { - .procname = "domainname", - 
.data = init_uts_ns.name.domainname, - .maxlen = sizeof(init_uts_ns.name.domainname), - .mode = 0644, - .proc_handler = proc_do_uts_string, - .poll = &domainname_poll, - }, - {} -}; - -static struct ctl_table uts_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = uts_kern_table, - }, - {} -}; - -#ifdef CONFIG_PROC_SYSCTL -/* - * Notify userspace about a change in a certain entry of uts_kern_table, - * identified by the parameter proc. - */ -void uts_proc_notify(enum uts_proc proc) -{ - struct ctl_table *table = &uts_kern_table[proc]; - - proc_sys_poll_notify(table->poll); -} -#endif - -static int __init utsname_sysctl_init(void) -{ - register_sysctl_table(uts_root_table); - return 0; -} - -__initcall(utsname_sysctl_init); -/* - * Generic waiting primitives. - * - * (C) 2004 William Irwin, Oracle - */ -#include -#include -#include -#include -#include -#include - -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) -{ - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); -} - -EXPORT_SYMBOL(__init_waitqueue_head); - -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue); - -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue_exclusive); - -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(remove_wait_queue); - - -/* - * Note: we use 
"set_current_state()" _after_ the wait-queue add, - * because we need a memory barrier there on SMP, so that any - * wake-function that tests for the wait-queue being active - * will be guaranteed to see waitqueue addition _or_ subsequent - * tests in this thread will see the wakeup having taken place. - * - * The spin_unlock() itself is semi-permeable and only protects - * one way (it only protects stuff inside the critical region and - * stops them from bleeding out - it would still allow subsequent - * loads to move into the critical region). - */ -void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait); - -void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait_exclusive); - -/** - * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - /* - * We can check for list emptiness outside the lock - * IFF: - * - we use the "careful" check that verifies both - * the next and prev pointers, so that there cannot - * be any half-pending updates in progress on other - * CPU's that we haven't seen yet (and that might - * still change the stack area. 
- * and - * - all other users take the lock (ie we can only - * have _one_ other CPU that looks at or modifies - * the list). - */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); - } -} -EXPORT_SYMBOL(finish_wait); - -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @mode: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - int ret = default_wake_function(wait, mode, sync, key); - - if (ret) - list_del_init(&wait->task_list); - return ret; -} -EXPORT_SYMBOL(autoremove_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return 
autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - do { - int ret; - - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(q->key.flags); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, 
TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); -/* - * Detect hard and soft lockups on a system - * - * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. - * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. - * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. - * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks - * to those contributors as well. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -int watchdog_enabled = 1; -int __read_mostly watchdog_thresh = 10; - -static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); -static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); -static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(bool, soft_watchdog_warn); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif - -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled = 0; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int __init nowatchdog_setup(char *str) -{ - watchdog_enabled = 0; - return 1; -} -__setup("nowatchdog", nowatchdog_setup); - -/* deprecated */ -static int __init nosoftlockup_setup(char *str) -{ - watchdog_enabled = 0; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); -/* */ - -/* - * Hard-lockup warnings should be triggered after just a few seconds. 
Soft- - * lockups can have false positives under extreme conditions. So we generally - * want a higher threshold for soft lockups than for hard lockups. So we couple - * the thresholds with a factor: we make the soft threshold twice the amount of - * time the hard threshold is. - */ -static int get_softlockup_thresh(void) -{ - return watchdog_thresh * 2; -} - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static unsigned long get_sample_period(void) -{ - /* - * convert watchdog_thresh from seconds to ns - * the divide by 5 is to give hrtimer 5 chances to - * increment before the hardlockup detector generates - * a warning - */ - return get_softlockup_thresh() * (NSEC_PER_SEC / 5); -} - -/* Commands for resetting the watchdog */ -static void __touch_watchdog(void) -{ - int this_cpu = smp_processor_id(); - - __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); -} - -void touch_softlockup_watchdog(void) -{ - __this_cpu_write(watchdog_touch_ts, 0); -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* - * this is done lockless - * do we care if a 0 races with a timestamp? 
- * all it means is the softlock check starts one cycle later - */ - for_each_online_cpu(cpu) - per_cpu(watchdog_touch_ts, cpu) = 0; -} - -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - if (watchdog_enabled) { - unsigned cpu; - - for_each_present_cpu(cpu) { - if (per_cpu(watchdog_nmi_touch, cpu) != true) - per_cpu(watchdog_nmi_touch, cpu) = true; - } - } - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - -void touch_softlockup_watchdog_sync(void) -{ - __raw_get_cpu_var(softlockup_touch_sync) = true; - __raw_get_cpu_var(watchdog_touch_ts) = 0; -} - -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* watchdog detector functions */ -static int is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return 1; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return 0; -} -#endif - -static int is_softlockup(unsigned long touch_ts) -{ - unsigned long now = get_timestamp(smp_processor_id()); - - /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + get_softlockup_thresh())) - return now - touch_ts; - - return 0; -} - -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. 
If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -static void watchdog_interrupt_count(void) -{ - __this_cpu_inc(hrtimer_interrupts); -} -#else -static inline void watchdog_interrupt_count(void) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - -/* watchdog kicker functions */ -static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) -{ - unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); - struct pt_regs *regs = get_irq_regs(); - int duration; - - /* kick the hardlockup detector */ - watchdog_interrupt_count(); - - /* kick the softlockup detector */ - wake_up_process(__this_cpu_read(softlockup_watchdog)); - - /* .. and repeat */ - hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); - - if (touch_ts == 0) { - if (unlikely(__this_cpu_read(softlockup_touch_sync))) { - /* - * If the time stamp was touched atomically - * make sure the scheduler tick is up to date. - */ - __this_cpu_write(softlockup_touch_sync, false); - sched_clock_tick(); - } - __touch_watchdog(); - return HRTIMER_RESTART; - } - - /* check for a softlockup - * This is done by making sure a high priority task is - * being scheduled. The task touches the watchdog to - * indicate it is getting cpu time. 
If it hasn't then - * this is a good indication some task is hogging the cpu - */ - duration = is_softlockup(touch_ts); - if (unlikely(duration)) { - /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) - return HRTIMER_RESTART; - - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", - smp_processor_id(), duration, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - if (softlockup_panic) - panic("softlockup: hung tasks"); - __this_cpu_write(soft_watchdog_warn, true); - } else - __this_cpu_write(soft_watchdog_warn, false); - - return HRTIMER_RESTART; -} - - -/* - * The watchdog thread - touches the timestamp. - */ -static int watchdog(void *unused) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_watchdog(); - - /* kick off the timer for the hardlockup detector */ - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), - HRTIMER_MODE_REL_PINNED); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in watchdog_timer_fn(). - */ - while (!kthread_should_stop()) { - __touch_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - param.sched_priority = 0; - sched_setscheduler(current, SCHED_NORMAL, ¶m); - return 0; -} - - -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int watchdog_nmi_enable(int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* is it already setup and enabled? 
*/ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - if (!IS_ERR(event)) { - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); - goto out_save; - } - - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); - else - printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - return; -} -#else -static int watchdog_nmi_enable(int cpu) { return 0; } -static void watchdog_nmi_disable(int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - -/* prepare/enable/disable routines */ -static void watchdog_prepare_cpu(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - WARN_ON(per_cpu(softlockup_watchdog, cpu)); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; -} - -static int watchdog_enable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err = 0; - - /* enable 
the perf event */ - err = watchdog_nmi_enable(cpu); - - /* Regardless of err above, fall through and start softlockup */ - - /* create the watchdog thread */ - if (!p) { - p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); - if (IS_ERR(p)) { - printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - if (!err) { - /* if hardlockup hasn't already set this */ - err = PTR_ERR(p); - /* and disable the perf event */ - watchdog_nmi_disable(cpu); - } - goto out; - } - kthread_bind(p, cpu); - per_cpu(watchdog_touch_ts, cpu) = 0; - per_cpu(softlockup_watchdog, cpu) = p; - wake_up_process(p); - } - -out: - return err; -} - -static void watchdog_disable(int cpu) -{ - struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); - - /* - * cancel the timer first to stop incrementing the stats - * and waking up the kthread - */ - hrtimer_cancel(hrtimer); - - /* disable the perf event */ - watchdog_nmi_disable(cpu); - - /* stop the watchdog thread */ - if (p) { - per_cpu(softlockup_watchdog, cpu) = NULL; - kthread_stop(p); - } -} - -/* sysctl functions */ -#ifdef CONFIG_SYSCTL -static void watchdog_enable_all_cpus(void) -{ - int cpu; - - watchdog_enabled = 0; - - for_each_online_cpu(cpu) - if (!watchdog_enable(cpu)) - /* if any cpu succeeds, watchdog is considered - enabled for the system */ - watchdog_enabled = 1; - - if (!watchdog_enabled) - printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); - -} - -static void watchdog_disable_all_cpus(void) -{ - int cpu; - - for_each_online_cpu(cpu) - watchdog_disable(cpu); - - /* if all watchdogs are disabled, then they are disabled for the system */ - watchdog_enabled = 0; -} - - -/* - * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh - */ - -int proc_dowatchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec_minmax(table, write, buffer, 
lenp, ppos); - if (ret || !write) - goto out; - - if (watchdog_enabled && watchdog_thresh) - watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - -out: - return ret; -} -#endif /* CONFIG_SYSCTL */ - - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - watchdog_prepare_cpu(hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (watchdog_enabled) - watchdog_enable(hotcpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - watchdog_disable(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - watchdog_disable(hotcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - - /* - * hardlockup and softlockup are not important enough - * to block cpu bring up. Just always succeed and - * rely on printk output to flag problems. - */ - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -void __init lockup_detector_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(notifier_to_errno(err)); - - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - return; -} -/* - * kernel/workqueue.c - generic async execution with shared worker pool - * - * Copyright (C) 2002 Ingo Molnar - * - * Derived from the taskqueue/keventd code by: - * David Woodhouse - * Andrew Morton - * Kai Petzke - * Theodore Ts'o - * - * Made to use alloc_percpu by Christoph Lameter. - * - * Copyright (C) 2010 SUSE Linux Products GmbH - * Copyright (C) 2010 Tejun Heo - * - * This is the generic async execution mechanism. Work items as are - * executed in process context. The worker pool is shared and - * automatically managed. 
There is one worker pool for each CPU and - * one extra for works which are better served by workers which are - * not bound to any specific CPU. - * - * Please read Documentation/workqueue.txt for details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "workqueue_sched.h" - -enum { - /* global_cwq flags */ - GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ - GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 3, /* freeze in progress */ - GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ - - /* worker flags */ - WORKER_STARTED = 1 << 0, /* started */ - WORKER_DIE = 1 << 1, /* die die die */ - WORKER_IDLE = 1 << 2, /* is idle */ - WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ - WORKER_REBIND = 1 << 5, /* mom is home, come back */ - WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ - WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, - - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ - - BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ - BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, - BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, - - MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ - IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ - - MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? 
HZ / 100 : 2, - /* call for help after 10ms - (min two ticks) */ - MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ - CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ - - /* - * Rescue workers are used only on emergencies and shared by - * all cpus. Give -20. - */ - RESCUER_NICE_LEVEL = -20, -}; - -/* - * Structure fields follow one of the following exclusion rules. - * - * I: Modifiable by initialization/destruction paths and read-only for - * everyone else. - * - * P: Preemption protected. Disabling preemption is enough and should - * only be modified and accessed from the local cpu. - * - * L: gcwq->lock protected. Access with gcwq->lock held. - * - * X: During normal operation, modification requires gcwq->lock and - * should be done only from local cpu. Either disabling preemption - * on local cpu or grabbing gcwq->lock is enough for read access. - * If GCWQ_DISASSOCIATED is set, it's identical to L. - * - * F: wq->flush_mutex protected. - * - * W: workqueue_lock protected. - */ - -struct global_cwq; - -/* - * The poor guys doing the actual heavy lifting. All on-duty workers - * are either serving the manager role, on idle list or on busy hash. 
- */ -struct worker { - /* on idle list while idle, on busy hash table while busy */ - union { - struct list_head entry; /* L: while idle */ - struct hlist_node hentry; /* L: while busy */ - }; - - struct work_struct *current_work; /* L: work being processed */ - struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ - struct list_head scheduled; /* L: scheduled works */ - struct task_struct *task; /* I: worker task */ - struct global_cwq *gcwq; /* I: the associated gcwq */ - /* 64 bytes boundary on 64bit, 32 on 32bit */ - unsigned long last_active; /* L: last active timestamp */ - unsigned int flags; /* X: flags */ - int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ -}; - -/* - * Global per-cpu workqueue. There's one and only one for each cpu - * and all works are queued and processed here regardless of their - * target workqueues. - */ -struct global_cwq { - spinlock_t lock; /* the gcwq lock */ - struct list_head worklist; /* L: list of pending works */ - unsigned int cpu; /* I: the associated cpu */ - unsigned int flags; /* L: GCWQ_* flags */ - - int nr_workers; /* L: total number of workers */ - int nr_idle; /* L: currently idle ones */ - - /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* X: list of idle workers */ - struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; - /* L: hash of busy workers */ - - struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for dworkers */ - - struct ida worker_ida; /* L: for worker IDs */ - - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ - struct worker *first_idle; /* L: first idle worker */ -} ____cacheline_aligned_in_smp; - -/* - * The per-CPU workqueue. 
The lower WORK_STRUCT_FLAG_BITS of - * work_struct->data are used for flags and thus cwqs need to be - * aligned at two's power of the number of flag bits. - */ -struct cpu_workqueue_struct { - struct global_cwq *gcwq; /* I: the associated gcwq */ - struct workqueue_struct *wq; /* I: the owning workqueue */ - int work_color; /* L: current color */ - int flush_color; /* L: flushing color */ - int nr_in_flight[WORK_NR_COLORS]; - /* L: nr of in_flight works */ - int nr_active; /* L: nr of active works */ - int max_active; /* L: max active works */ - struct list_head delayed_works; /* L: delayed works */ -}; - -/* - * Structure used to wait for workqueue flush. - */ -struct wq_flusher { - struct list_head list; /* F: list of flushers */ - int flush_color; /* F: flush color waiting for */ - struct completion done; /* flush completion */ -}; - -/* - * All cpumasks are assumed to be always set on UP and thus can't be - * used to determine whether there's something to be done. - */ -#ifdef CONFIG_SMP -typedef cpumask_var_t mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) \ - cpumask_test_and_set_cpu((cpu), (mask)) -#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) -#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) -#define free_mayday_mask(mask) free_cpumask_var((mask)) -#else -typedef unsigned long mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) -#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) -#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) -#define alloc_mayday_mask(maskp, gfp) true -#define free_mayday_mask(mask) do { } while (0) -#endif - -/* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: - */ -struct workqueue_struct { - unsigned int flags; /* W: WQ_* flags */ - union { - struct cpu_workqueue_struct __percpu *pcpu; - struct cpu_workqueue_struct 
*single; - unsigned long v; - } cpu_wq; /* I: cwq's */ - struct list_head list; /* W: list of all workqueues */ - - struct mutex flush_mutex; /* protects wq flushing */ - int work_color; /* F: current work color */ - int flush_color; /* F: current flush color */ - atomic_t nr_cwqs_to_flush; /* flush in progress */ - struct wq_flusher *first_flusher; /* F: first flusher */ - struct list_head flusher_queue; /* F: flush waiters */ - struct list_head flusher_overflow; /* F: flush overflow list */ - - mayday_mask_t mayday_mask; /* cpus requesting rescue */ - struct worker *rescuer; /* I: rescue worker */ - - int nr_drainers; /* W: drain in progress */ - int saved_max_active; /* W: saved cwq max_active */ -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif - char name[]; /* I: workqueue name */ -}; - -struct workqueue_struct *system_wq __read_mostly; -struct workqueue_struct *system_long_wq __read_mostly; -struct workqueue_struct *system_nrt_wq __read_mostly; -struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezable_wq __read_mostly; -struct workqueue_struct *system_nrt_freezable_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_wq); -EXPORT_SYMBOL_GPL(system_long_wq); -EXPORT_SYMBOL_GPL(system_nrt_wq); -EXPORT_SYMBOL_GPL(system_unbound_wq); -EXPORT_SYMBOL_GPL(system_freezable_wq); -EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); - -#define CREATE_TRACE_POINTS -#include - -#define for_each_busy_worker(worker, i, pos, gcwq) \ - for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ - hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) - -static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) -{ - if (cpu < nr_cpu_ids) { - if (sw & 1) { - cpu = cpumask_next(cpu, mask); - if (cpu < nr_cpu_ids) - return cpu; - } - if (sw & 2) - return WORK_CPU_UNBOUND; - } - return WORK_CPU_NONE; -} - -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) -{ - 
return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); -} - -/* - * CPU iterators - * - * An extra gcwq is defined for an invalid cpu number - * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to - * for_each_*_cpu() iterators but also considers the unbound gcwq. - * - * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_cwq_cpu() : possible CPUs for bound workqueues, - * WORK_CPU_UNBOUND for unbound workqueues - */ -#define for_each_gcwq_cpu(cpu) \ - for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) - -#define for_each_online_gcwq_cpu(cpu) \ - for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) - -#define for_each_cwq_cpu(cpu, wq) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) - -#ifdef CONFIG_DEBUG_OBJECTS_WORK - -static struct debug_obj_descr work_debug_descr; - -static void *work_debug_hint(void *addr) -{ - return ((struct work_struct *) addr)->func; -} - -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int work_fixup_init(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - cancel_work_sync(work); - debug_object_init(work, &work_debug_descr); - return 1; - default: - return 0; - } -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static int work_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * 
This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return 0; - } - WARN_ON_ONCE(1); - return 0; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int work_fixup_free(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - cancel_work_sync(work); - debug_object_free(work, &work_debug_descr); - return 1; - default: - return 0; - } -} - -static struct debug_obj_descr work_debug_descr = { - .name = "work_struct", - .debug_hint = work_debug_hint, - .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, - .fixup_free = work_fixup_free, -}; - -static inline void debug_work_activate(struct work_struct *work) -{ - debug_object_activate(work, &work_debug_descr); -} - -static inline void debug_work_deactivate(struct work_struct *work) -{ - debug_object_deactivate(work, &work_debug_descr); -} - -void __init_work(struct work_struct *work, int onstack) -{ - if (onstack) - debug_object_init_on_stack(work, &work_debug_descr); - else - debug_object_init(work, &work_debug_descr); -} -EXPORT_SYMBOL_GPL(__init_work); - -void destroy_work_on_stack(struct work_struct *work) -{ - debug_object_free(work, &work_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_work_on_stack); - -#else -static inline void debug_work_activate(struct work_struct *work) { } -static inline void debug_work_deactivate(struct work_struct *work) { } -#endif - -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? 
*/ - -/* - * The almighty global cpu workqueues. nr_running is the only field - * which is expected to be used frequently by other cpus via - * try_to_wake_up(). Put it in a separate cacheline. - */ -static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); - -/* - * Global cpu workqueue and nr_running counter for unbound gcwq. The - * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its - * workers have WORKER_UNBOUND set. - */ -static struct global_cwq unbound_global_cwq; -static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ - -static int worker_thread(void *__worker); - -static struct global_cwq *get_gcwq(unsigned int cpu) -{ - if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(global_cwq, cpu); - else - return &unbound_global_cwq; -} - -static atomic_t *get_gcwq_nr_running(unsigned int cpu) -{ - if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(gcwq_nr_running, cpu); - else - return &unbound_gcwq_nr_running; -} - -static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, - struct workqueue_struct *wq) -{ - if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) { -#ifdef CONFIG_SMP - return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); -#else - return wq->cpu_wq.single; -#endif - } - } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->cpu_wq.single; - return NULL; -} - -static unsigned int work_color_to_flags(int color) -{ - return color << WORK_STRUCT_COLOR_SHIFT; -} - -static int get_work_color(struct work_struct *work) -{ - return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & - ((1 << WORK_STRUCT_COLOR_BITS) - 1); -} - -static int work_next_color(int color) -{ - return (color + 1) % WORK_NR_COLORS; -} - -/* - * A work's data points to the cwq with WORK_STRUCT_CWQ set while the - * work is on queue. Once execution starts, WORK_STRUCT_CWQ is - * cleared and the work data contains the cpu number it was last on. 
- * - * set_work_{cwq|cpu}() and clear_work_data() can be used to set the - * cwq, cpu or clear work->data. These functions should only be - * called while the work is owned - ie. while the PENDING bit is set. - * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq - * corresponding to a work. gcwq is available once the work has been - * queued anywhere after initialization. cwq is available only from - * queueing until execution starts. - */ -static inline void set_work_data(struct work_struct *work, unsigned long data, - unsigned long flags) -{ - BUG_ON(!work_pending(work)); - atomic_long_set(&work->data, data | flags | work_static(work)); -} - -static void set_work_cwq(struct work_struct *work, - struct cpu_workqueue_struct *cwq, - unsigned long extra_flags) -{ - set_work_data(work, (unsigned long)cwq, - WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); -} - -static void set_work_cpu(struct work_struct *work, unsigned int cpu) -{ - set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); -} - -static void clear_work_data(struct work_struct *work) -{ - set_work_data(work, WORK_STRUCT_NO_CPU, 0); -} - -static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) -{ - unsigned long data = atomic_long_read(&work->data); - - if (data & WORK_STRUCT_CWQ) - return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); - else - return NULL; -} - -static struct global_cwq *get_work_gcwq(struct work_struct *work) -{ - unsigned long data = atomic_long_read(&work->data); - unsigned int cpu; - - if (data & WORK_STRUCT_CWQ) - return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; - - cpu = data >> WORK_STRUCT_FLAG_BITS; - if (cpu == WORK_CPU_NONE) - return NULL; - - BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); - return get_gcwq(cpu); -} - -/* - * Policy functions. These define the policies on how the global - * worker pool is managed. 
Unless noted otherwise, these functions - * assume that they're being called with gcwq->lock held. - */ - -static bool __need_more_worker(struct global_cwq *gcwq) -{ - return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || - gcwq->flags & GCWQ_HIGHPRI_PENDING; -} - -/* - * Need to wake up a worker? Called from anything but currently - * running workers. - */ -static bool need_more_worker(struct global_cwq *gcwq) -{ - return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); -} - -/* Can I start working? Called from busy but !running workers. */ -static bool may_start_working(struct global_cwq *gcwq) -{ - return gcwq->nr_idle; -} - -/* Do I need to keep working? Called from currently running workers. */ -static bool keep_working(struct global_cwq *gcwq) -{ - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - - return !list_empty(&gcwq->worklist) && - (atomic_read(nr_running) <= 1 || - gcwq->flags & GCWQ_HIGHPRI_PENDING); -} - -/* Do we need a new worker? Called from manager. */ -static bool need_to_create_worker(struct global_cwq *gcwq) -{ - return need_more_worker(gcwq) && !may_start_working(gcwq); -} - -/* Do I need to be the manager? */ -static bool need_to_manage_workers(struct global_cwq *gcwq) -{ - return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; -} - -/* Do we have too many workers and should some go away? */ -static bool too_many_workers(struct global_cwq *gcwq) -{ - bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->nr_workers - nr_idle; - - return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; -} - -/* - * Wake up functions. - */ - -/* Return the first worker. 
Safe with preemption disabled */ -static struct worker *first_worker(struct global_cwq *gcwq) -{ - if (unlikely(list_empty(&gcwq->idle_list))) - return NULL; - - return list_first_entry(&gcwq->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @gcwq: gcwq to wake worker for - * - * Wake up the first idle worker of @gcwq. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void wake_up_worker(struct global_cwq *gcwq) -{ - struct worker *worker = first_worker(gcwq); - - if (likely(worker)) - wake_up_process(worker->task); -} - -/** - * wq_worker_waking_up - a worker is waking up - * @task: task waking up - * @cpu: CPU @task is waking up to - * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) -{ - struct worker *worker = kthread_data(task); - - if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); -} - -/** - * wq_worker_sleeping - a worker is going to sleep - * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number - * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * RETURNS: - * Worker task on @cpu to wake up, %NULL if none. 
- */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) -{ - struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); - atomic_t *nr_running = get_gcwq_nr_running(cpu); - - if (worker->flags & WORKER_NOT_RUNNING) - return NULL; - - /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); - - /* - * The counterpart of the following dec_and_test, implied mb, - * worklist not empty test sequence is in insert_work(). - * Please read comment there. - * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. - */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) - to_wakeup = first_worker(gcwq); - return to_wakeup ? to_wakeup->task : NULL; -} - -/** - * worker_set_flags - set worker flags and adjust nr_running accordingly - * @worker: self - * @flags: flags to set - * @wakeup: wakeup an idle worker if necessary - * - * Set @flags in @worker->flags and adjust nr_running accordingly. If - * nr_running becomes zero and @wakeup is %true, an idle worker is - * woken up. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) - */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags, - bool wakeup) -{ - struct global_cwq *gcwq = worker->gcwq; - - WARN_ON_ONCE(worker->task != current); - - /* - * If transitioning into NOT_RUNNING, adjust nr_running and - * wake up an idle worker as necessary if requested by - * @wakeup. 
- */ - if ((flags & WORKER_NOT_RUNNING) && - !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - - if (wakeup) { - if (atomic_dec_and_test(nr_running) && - !list_empty(&gcwq->worklist)) - wake_up_worker(gcwq); - } else - atomic_dec(nr_running); - } - - worker->flags |= flags; -} - -/** - * worker_clr_flags - clear worker flags and adjust nr_running accordingly - * @worker: self - * @flags: flags to clear - * - * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) - */ -static inline void worker_clr_flags(struct worker *worker, unsigned int flags) -{ - struct global_cwq *gcwq = worker->gcwq; - unsigned int oflags = worker->flags; - - WARN_ON_ONCE(worker->task != current); - - worker->flags &= ~flags; - - /* - * If transitioning out of NOT_RUNNING, increment nr_running. Note - * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask - * of multiple flags, not a single flag. - */ - if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) - if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(gcwq->cpu)); -} - -/** - * busy_worker_head - return the busy hash head for a work - * @gcwq: gcwq of interest - * @work: work to be hashed - * - * Return hash head of @gcwq for @work. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to the hash head. - */ -static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, - struct work_struct *work) -{ - const int base_shift = ilog2(sizeof(struct work_struct)); - unsigned long v = (unsigned long)work; - - /* simple shift and fold hash, do we need something better? 
*/ - v >>= base_shift; - v += v >> BUSY_WORKER_HASH_ORDER; - v &= BUSY_WORKER_HASH_MASK; - - return &gcwq->busy_hash[v]; -} - -/** - * __find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @bwh: hash head as returned by busy_worker_head() - * @work: work to find worker for - * - * Find a worker which is executing @work on @gcwq. @bwh should be - * the hash head obtained by calling busy_worker_head() with the same - * work. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL - * otherwise. - */ -static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, - struct hlist_head *bwh, - struct work_struct *work) -{ - struct worker *worker; - struct hlist_node *tmp; - - hlist_for_each_entry(worker, tmp, bwh, hentry) - if (worker->current_work == work) - return worker; - return NULL; -} - -/** - * find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @gcwq. This function is - * identical to __find_worker_executing_work() except that this - * function calculates @bwh itself. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct global_cwq *gcwq, - struct work_struct *work) -{ - return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), - work); -} - -/** - * gcwq_determine_ins_pos - find insertion position - * @gcwq: gcwq of interest - * @cwq: cwq a work is being queued for - * - * A work for @cwq is about to be queued on @gcwq, determine insertion - * position for the work. If @cwq is for HIGHPRI wq, the work is - * queued at the head of the queue but in FIFO order with respect to - * other HIGHPRI works; otherwise, at the end of the queue. 
This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that - * there are HIGHPRI works pending. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to inserstion position. - */ -static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, - struct cpu_workqueue_struct *cwq) -{ - struct work_struct *twork; - - if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->worklist; - - list_for_each_entry(twork, &gcwq->worklist, entry) { - struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); - - if (!(tcwq->wq->flags & WQ_HIGHPRI)) - break; - } - - gcwq->flags |= GCWQ_HIGHPRI_PENDING; - return &twork->entry; -} - -/** - * insert_work - insert a work into gcwq - * @cwq: cwq @work belongs to - * @work: work to insert - * @head: insertion point - * @extra_flags: extra WORK_STRUCT_* flags to set - * - * Insert @work which belongs to @cwq into @gcwq after @head. - * @extra_flags is or'd to work_struct flags. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head, - unsigned int extra_flags) -{ - struct global_cwq *gcwq = cwq->gcwq; - - /* we own @work, set data and link */ - set_work_cwq(work, cwq, extra_flags); - - /* - * Ensure that we get the right work->data if we see the - * result of list_add() below, see try_to_grab_pending(). - */ - smp_wmb(); - - list_add_tail(&work->entry, head); - - /* - * Ensure either worker_sched_deactivated() sees the above - * list_add_tail() or we see zero nr_running to avoid workers - * lying around lazily while there are works to be processed. - */ - smp_mb(); - - if (__need_more_worker(gcwq)) - wake_up_worker(gcwq); -} - -/* - * Test whether @work is being queued from another work executing on the - * same workqueue. This is rather expensive and should only be used from - * cold paths. 
- */ -static bool is_chained_work(struct workqueue_struct *wq) -{ - unsigned long flags; - unsigned int cpu; - - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; - struct hlist_node *pos; - int i; - - spin_lock_irqsave(&gcwq->lock, flags); - for_each_busy_worker(worker, i, pos, gcwq) { - if (worker->task != current) - continue; - spin_unlock_irqrestore(&gcwq->lock, flags); - /* - * I'm @worker, no locking necessary. See if @work - * is headed to the same workqueue. - */ - return worker->current_cwq->wq == wq; - } - spin_unlock_irqrestore(&gcwq->lock, flags); - } - return false; -} - -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, - struct work_struct *work) -{ - struct global_cwq *gcwq; - struct cpu_workqueue_struct *cwq; - struct list_head *worklist; - unsigned int work_flags; - unsigned long flags; - - debug_work_activate(work); - - /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq))) - return; - - /* determine gcwq to use */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *last_gcwq; - - if (unlikely(cpu == WORK_CPU_UNBOUND)) - cpu = raw_smp_processor_id(); - - /* - * It's multi cpu. If @wq is non-reentrant and @work - * was previously on a different cpu, it might still - * be running there, in which case the work needs to - * be queued on that cpu to guarantee non-reentrance. - */ - gcwq = get_gcwq(cpu); - if (wq->flags & WQ_NON_REENTRANT && - (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { - struct worker *worker; - - spin_lock_irqsave(&last_gcwq->lock, flags); - - worker = find_worker_executing_work(last_gcwq, work); - - if (worker && worker->current_cwq->wq == wq) - gcwq = last_gcwq; - else { - /* meh... 
not running there, queue here */ - spin_unlock_irqrestore(&last_gcwq->lock, flags); - spin_lock_irqsave(&gcwq->lock, flags); - } - } else - spin_lock_irqsave(&gcwq->lock, flags); - } else { - gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock_irqsave(&gcwq->lock, flags); - } - - /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(cpu, cwq, work); - - BUG_ON(!list_empty(&work->entry)); - - cwq->nr_in_flight[cwq->work_color]++; - work_flags = work_color_to_flags(cwq->work_color); - - if (likely(cwq->nr_active < cwq->max_active)) { - trace_workqueue_activate_work(work); - cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); - } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &cwq->delayed_works; - } - - insert_work(cwq, work, worklist, work_flags); - - spin_unlock_irqrestore(&gcwq->lock, flags); -} - -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - int ret; - - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); - - return ret; -} -EXPORT_SYMBOL_GPL(queue_work); - -/** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. 
- */ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) -{ - int ret = 0; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - __queue_work(cpu, wq, work); - ret = 1; - } - return ret; -} -EXPORT_SYMBOL_GPL(queue_work_on); - -static void delayed_work_timer_fn(unsigned long __data) -{ - struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - - __queue_work(smp_processor_id(), cwq->wq, &dwork->work); -} - -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - if (delay == 0) - return queue_work(wq, &dwork->work); - - return queue_delayed_work_on(-1, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - -/** - * queue_delayed_work_on - queue work on specific CPU after delay - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - int ret = 0; - struct timer_list *timer = &dwork->timer; - struct work_struct *work = &dwork->work; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - unsigned int lcpu; - - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - - timer_stats_timer_set_start_info(&dwork->timer); - - /* - * This stores cwq for the moment, for the timer_fn. - * Note that the work's gcwq is preserved to allow - * reentrance detection for delayed works. 
- */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); - - if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) - lcpu = gcwq->cpu; - else - lcpu = raw_smp_processor_id(); - } else - lcpu = WORK_CPU_UNBOUND; - - set_work_cwq(work, get_cwq(lcpu, wq), 0); - - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; - - if (unlikely(cpu >= 0)) - add_timer_on(timer, cpu); - else - add_timer(timer); - ret = 1; - } - return ret; -} -EXPORT_SYMBOL_GPL(queue_delayed_work_on); - -/** - * worker_enter_idle - enter idle state - * @worker: worker which is entering idle state - * - * @worker is entering idle state. Update stats and idle timer if - * necessary. - * - * LOCKING: - * spin_lock_irq(gcwq->lock). - */ -static void worker_enter_idle(struct worker *worker) -{ - struct global_cwq *gcwq = worker->gcwq; - - BUG_ON(worker->flags & WORKER_IDLE); - BUG_ON(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev)); - - /* can't use worker_set_flags(), also called from start_worker() */ - worker->flags |= WORKER_IDLE; - gcwq->nr_idle++; - worker->last_active = jiffies; - - /* idle_list is LIFO */ - list_add(&worker->entry, &gcwq->idle_list); - - if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); - - /* sanity check nr_running */ - WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && - atomic_read(get_gcwq_nr_running(gcwq->cpu))); -} - -/** - * worker_leave_idle - leave idle state - * @worker: worker which is leaving idle state - * - * @worker is leaving idle state. Update stats. - * - * LOCKING: - * spin_lock_irq(gcwq->lock). 
- */ -static void worker_leave_idle(struct worker *worker) -{ - struct global_cwq *gcwq = worker->gcwq; - - BUG_ON(!(worker->flags & WORKER_IDLE)); - worker_clr_flags(worker, WORKER_IDLE); - gcwq->nr_idle--; - list_del_init(&worker->entry); -} - -/** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq - * @worker: self - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by rogue workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online. kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and gcwq may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep. Called without any lock but returns with gcwq->lock - * held. - * - * RETURNS: - * %true if the associated gcwq is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker *worker) -__acquires(&gcwq->lock) -{ - struct global_cwq *gcwq = worker->gcwq; - struct task_struct *task = worker->task; - - while (true) { - /* - * The following call may fail, succeed or succeed - * without actually migrating the task to the cpu if - * it races with cpu hotunplug operation. Verify - * against GCWQ_DISASSOCIATED. 
- */ - if (!(gcwq->flags & GCWQ_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); - - spin_lock_irq(&gcwq->lock); - if (gcwq->flags & GCWQ_DISASSOCIATED) - return false; - if (task_cpu(task) == gcwq->cpu && - cpumask_equal(&current->cpus_allowed, - get_cpu_mask(gcwq->cpu))) - return true; - spin_unlock_irq(&gcwq->lock); - - /* - * We've raced with CPU hot[un]plug. Give it a breather - * and retry migration. cond_resched() is required here; - * otherwise, we might deadlock against cpu_stop trying to - * bring down the CPU on non-preemptive kernel. - */ - cpu_relax(); - cond_resched(); - } -} - -/* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. - */ -static void worker_rebind_fn(struct work_struct *work) -{ - struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->gcwq; - - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); - - spin_unlock_irq(&gcwq->lock); -} - -static struct worker *alloc_worker(void) -{ - struct worker *worker; - - worker = kzalloc(sizeof(*worker), GFP_KERNEL); - if (worker) { - INIT_LIST_HEAD(&worker->entry); - INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); - /* on creation a worker is in !idle && prep state */ - worker->flags = WORKER_PREP; - } - return worker; -} - -/** - * create_worker - create a new workqueue worker - * @gcwq: gcwq the new worker will belong to - * @bind: whether to set affinity to @cpu or not - * - * Create a new worker which is bound to @gcwq. The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). - * - * CONTEXT: - * Might sleep. Does GFP_KERNEL allocations. - * - * RETURNS: - * Pointer to the newly created worker.
- */ -static struct worker *create_worker(struct global_cwq *gcwq, bool bind) -{ - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; - struct worker *worker = NULL; - int id = -1; - - spin_lock_irq(&gcwq->lock); - while (ida_get_new(&gcwq->worker_ida, &id)) { - spin_unlock_irq(&gcwq->lock); - if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) - goto fail; - spin_lock_irq(&gcwq->lock); - } - spin_unlock_irq(&gcwq->lock); - - worker = alloc_worker(); - if (!worker) - goto fail; - - worker->gcwq = gcwq; - worker->id = id; - - if (!on_unbound_cpu) - worker->task = kthread_create_on_node(worker_thread, - worker, - cpu_to_node(gcwq->cpu), - "kworker/%u:%d", gcwq->cpu, id); - else - worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d", id); - if (IS_ERR(worker->task)) - goto fail; - - /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. - */ - if (bind && !on_unbound_cpu) - kthread_bind(worker->task, gcwq->cpu); - else { - worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; - } - - return worker; -fail: - if (id >= 0) { - spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); - spin_unlock_irq(&gcwq->lock); - } - kfree(worker); - return NULL; -} - -/** - * start_worker - start a newly created worker - * @worker: worker to start - * - * Make the gcwq aware of @worker and start it. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void start_worker(struct worker *worker) -{ - worker->flags |= WORKER_STARTED; - worker->gcwq->nr_workers++; - worker_enter_idle(worker); - wake_up_process(worker->task); -} - -/** - * destroy_worker - destroy a workqueue worker - * @worker: worker to be destroyed - * - * Destroy @worker and adjust @gcwq stats accordingly. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. 
- */ -static void destroy_worker(struct worker *worker) -{ - struct global_cwq *gcwq = worker->gcwq; - int id = worker->id; - - /* sanity check frenzy */ - BUG_ON(worker->current_work); - BUG_ON(!list_empty(&worker->scheduled)); - - if (worker->flags & WORKER_STARTED) - gcwq->nr_workers--; - if (worker->flags & WORKER_IDLE) - gcwq->nr_idle--; - - list_del_init(&worker->entry); - worker->flags |= WORKER_DIE; - - spin_unlock_irq(&gcwq->lock); - - kthread_stop(worker->task); - kfree(worker); - - spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); -} - -static void idle_worker_timeout(unsigned long __gcwq) -{ - struct global_cwq *gcwq = (void *)__gcwq; - - spin_lock_irq(&gcwq->lock); - - if (too_many_workers(gcwq)) { - struct worker *worker; - unsigned long expires; - - /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); - expires = worker->last_active + IDLE_WORKER_TIMEOUT; - - if (time_before(jiffies, expires)) - mod_timer(&gcwq->idle_timer, expires); - else { - /* it's been idle for too long, wake up manager */ - gcwq->flags |= GCWQ_MANAGE_WORKERS; - wake_up_worker(gcwq); - } - } - - spin_unlock_irq(&gcwq->lock); -} - -static bool send_mayday(struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct workqueue_struct *wq = cwq->wq; - unsigned int cpu; - - if (!(wq->flags & WQ_RESCUER)) - return false; - - /* mayday mayday mayday */ - cpu = cwq->gcwq->cpu; - /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ - if (cpu == WORK_CPU_UNBOUND) - cpu = 0; - if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) - wake_up_process(wq->rescuer->task); - return true; -} - -static void gcwq_mayday_timeout(unsigned long __gcwq) -{ - struct global_cwq *gcwq = (void *)__gcwq; - struct work_struct *work; - - spin_lock_irq(&gcwq->lock); - - if (need_to_create_worker(gcwq)) { - /* - * We've been trying to create a new worker but - * haven't been 
successful. We might be hitting an - * allocation deadlock. Send distress signals to - * rescuers. - */ - list_for_each_entry(work, &gcwq->worklist, entry) - send_mayday(work); - } - - spin_unlock_irq(&gcwq->lock); - - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); -} - -/** - * maybe_create_worker - create a new worker if necessary - * @gcwq: gcwq to create a new worker for - * - * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to - * have at least one idle worker on return from this function. If - * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is - * sent to all rescuers with works scheduled on @gcwq to resolve - * possible allocation deadlock. - * - * On return, need_to_create_worker() is guaranteed to be false and - * may_start_working() true. - * - * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. Does GFP_KERNEL allocations. Called only from - * manager. - * - * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true - * otherwise. 
- */ -static bool maybe_create_worker(struct global_cwq *gcwq) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!need_to_create_worker(gcwq)) - return false; -restart: - spin_unlock_irq(&gcwq->lock); - - /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); - - while (true) { - struct worker *worker; - - worker = create_worker(gcwq, true); - if (worker) { - del_timer_sync(&gcwq->mayday_timer); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - BUG_ON(need_to_create_worker(gcwq)); - return true; - } - - if (!need_to_create_worker(gcwq)) - break; - - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CREATE_COOLDOWN); - - if (!need_to_create_worker(gcwq)) - break; - } - - del_timer_sync(&gcwq->mayday_timer); - spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) - goto restart; - return true; -} - -/** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @gcwq: gcwq to destroy workers for - * - * Destroy @gcwq workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. Called only from manager. - * - * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true - * otherwise. - */ -static bool maybe_destroy_workers(struct global_cwq *gcwq) -{ - bool ret = false; - - while (too_many_workers(gcwq)) { - struct worker *worker; - unsigned long expires; - - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); - expires = worker->last_active + IDLE_WORKER_TIMEOUT; - - if (time_before(jiffies, expires)) { - mod_timer(&gcwq->idle_timer, expires); - break; - } - - destroy_worker(worker); - ret = true; - } - - return ret; -} - -/** - * manage_workers - manage worker pool - * @worker: self - * - * Assume the manager role and manage gcwq worker pool @worker belongs - * to. 
At any given time, there can be only zero or one manager per - * gcwq. The exclusion is handled automatically by this function. - * - * The caller can safely start processing works on false return. On - * true return, it's guaranteed that need_to_create_worker() is false - * and may_start_working() is true. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. Does GFP_KERNEL allocations. - * - * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true if - * some action was taken. - */ -static bool manage_workers(struct worker *worker) -{ - struct global_cwq *gcwq = worker->gcwq; - bool ret = false; - - if (gcwq->flags & GCWQ_MANAGING_WORKERS) - return ret; - - gcwq->flags &= ~GCWQ_MANAGE_WORKERS; - gcwq->flags |= GCWQ_MANAGING_WORKERS; - - /* - * Destroy and then create so that may_start_working() is true - * on return. - */ - ret |= maybe_destroy_workers(gcwq); - ret |= maybe_create_worker(gcwq); - - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. - */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); - - return ret; -} - -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). 
- */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) -{ - struct work_struct *work = list_first_entry(&cwq->delayed_works, - struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); - - trace_workqueue_activate_work(work); - move_linked_works(work, pos, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; -} - -/** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest - * @color: color of work which left the queue - * @delayed: for a delayed work - * - * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, - bool delayed) -{ - /* ignore uncolored works */ - if (color == WORK_NO_COLOR) - return; - - cwq->nr_in_flight[color]--; - - if (!delayed) { - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } - } - - /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) - return; - - /* are there still in-flight works? 
*/ - if (cwq->nr_in_flight[color]) - return; - - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; - - /* - * If this was the last cwq, wake up the first flusher. It - * will handle the rest. - */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); -} - -/** - * process_one_work - process single work - * @worker: self - * @work: work to process - * - * Process @work. This function contains all the logics necessary to - * process a single work including synchronization against and - * interaction with other workers on the same cpu, queueing and - * flushing. As long as context requirement is met, any worker can - * call this function to process a work. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. - */ -static void process_one_work(struct worker *worker, struct work_struct *work) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct global_cwq *gcwq = cwq->gcwq; - struct hlist_head *bwh = busy_worker_head(gcwq, work); - bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; - work_func_t f = work->func; - int work_color; - struct worker *collision; -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the struct work_struct from - * inside the function that is called from it, this we need to - * take into account for lockdep too. To avoid bogus "held - * lock freed" warnings as well as problems when looking into - * work->lockdep_map, make a copy and use that here. - */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - /* - * A single work shouldn't be executed concurrently by - * multiple workers on a single cpu. Check whether anyone is - * already processing the work. If so, defer the work to the - * currently executing one. 
- */ - collision = __find_worker_executing_work(gcwq, bwh, work); - if (unlikely(collision)) { - move_linked_works(work, &collision->scheduled, NULL); - return; - } - - /* claim and process */ - debug_work_deactivate(work); - hlist_add_head(&worker->hentry, bwh); - worker->current_work = work; - worker->current_cwq = cwq; - work_color = get_work_color(work); - - /* record the current cpu number in the work data and dequeue */ - set_work_cpu(work, gcwq->cpu); - list_del_init(&work->entry); - - /* - * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, - * wake up another worker; otherwise, clear HIGHPRI_PENDING. - */ - if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&gcwq->worklist, - struct work_struct, entry); - - if (!list_empty(&gcwq->worklist) && - get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(gcwq); - else - gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; - } - - /* - * CPU intensive works don't participate in concurrency - * management. They're the scheduler's responsibility. - */ - if (unlikely(cpu_intensive)) - worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); - - spin_unlock_irq(&gcwq->lock); - - work_clear_pending(work); - lock_map_acquire_read(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - trace_workqueue_execute_start(work); - f(work); - /* - * While we must be careful to not use "work" after this, the trace - * point will only record its address. 
- */ - trace_workqueue_execute_end(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); - } - - spin_lock_irq(&gcwq->lock); - - /* clear cpu intensive status */ - if (unlikely(cpu_intensive)) - worker_clr_flags(worker, WORKER_CPU_INTENSIVE); - - /* we're done with it, release */ - hlist_del_init(&worker->hentry); - worker->current_work = NULL; - worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color, false); -} - -/** - * process_scheduled_works - process scheduled works - * @worker: self - * - * Process all scheduled works. Please note that the scheduled list - * may change while processing a work, so this function repeatedly - * fetches a work from the top and executes it. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. - */ -static void process_scheduled_works(struct worker *worker) -{ - while (!list_empty(&worker->scheduled)) { - struct work_struct *work = list_first_entry(&worker->scheduled, - struct work_struct, entry); - process_one_work(worker, work); - } -} - -/** - * worker_thread - the worker thread function - * @__worker: self - * - * The gcwq worker thread function. There's a single dynamic pool of - * these per each cpu. These workers process all works regardless of - * their specific target workqueue. The only exception is works which - * belong to workqueues with a rescuer which will be explained in - * rescuer_thread(). 
- */ -static int worker_thread(void *__worker) -{ - struct worker *worker = __worker; - struct global_cwq *gcwq = worker->gcwq; - - /* tell the scheduler that this is a workqueue worker */ - worker->task->flags |= PF_WQ_WORKER; -woke_up: - spin_lock_irq(&gcwq->lock); - - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { - spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; - } - - worker_leave_idle(worker); -recheck: - /* no more worker necessary? */ - if (!need_more_worker(gcwq)) - goto sleep; - - /* do we need to manage? */ - if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) - goto recheck; - - /* - * ->scheduled list can only be filled while a worker is - * preparing to process a work or actually processing it. - * Make sure nobody diddled with it while I was sleeping. - */ - BUG_ON(!list_empty(&worker->scheduled)); - - /* - * When control reaches this point, we're guaranteed to have - * at least one idle worker or that someone else has already - * assumed the manager role. - */ - worker_clr_flags(worker, WORKER_PREP); - - do { - struct work_struct *work = - list_first_entry(&gcwq->worklist, - struct work_struct, entry); - - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { - /* optimization path, not strictly necessary */ - process_one_work(worker, work); - if (unlikely(!list_empty(&worker->scheduled))) - process_scheduled_works(worker); - } else { - move_linked_works(work, &worker->scheduled, NULL); - process_scheduled_works(worker); - } - } while (keep_working(gcwq)); - - worker_set_flags(worker, WORKER_PREP, false); -sleep: - if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) - goto recheck; - - /* - * gcwq->lock is held and there's no work to process and no - * need to manage, sleep. 
Workers are woken up only while - * holding gcwq->lock or from local cpu, so setting the - * current state before releasing gcwq->lock is enough to - * prevent losing any event. - */ - worker_enter_idle(worker); - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&gcwq->lock); - schedule(); - goto woke_up; -} - -/** - * rescuer_thread - the rescuer thread function - * @__wq: the associated workqueue - * - * Workqueue rescuer thread function. There's one rescuer for each - * workqueue which has WQ_RESCUER set. - * - * Regular work processing on a gcwq may block trying to create a new - * worker which uses GFP_KERNEL allocation which has slight chance of - * developing into deadlock if some works currently on the same queue - * need to be processed to satisfy the GFP_KERNEL allocation. This is - * the problem rescuer solves. - * - * When such condition is possible, the gcwq summons rescuers of all - * workqueues which have works queued on the gcwq and let them process - * those works so that forward progress can be guaranteed. - * - * This should happen rarely. - */ -static int rescuer_thread(void *__wq) -{ - struct workqueue_struct *wq = __wq; - struct worker *rescuer = wq->rescuer; - struct list_head *scheduled = &rescuer->scheduled; - bool is_unbound = wq->flags & WQ_UNBOUND; - unsigned int cpu; - - set_user_nice(current, RESCUER_NICE_LEVEL); -repeat: - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) - return 0; - - /* - * See whether any cpu is asking for help. Unbounded - * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. - */ - for_each_mayday_cpu(cpu, wq->mayday_mask) { - unsigned int tcpu = is_unbound ? 
WORK_CPU_UNBOUND : cpu; - struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct global_cwq *gcwq = cwq->gcwq; - struct work_struct *work, *n; - - __set_current_state(TASK_RUNNING); - mayday_clear_cpu(cpu, wq->mayday_mask); - - /* migrate to the target cpu if possible */ - rescuer->gcwq = gcwq; - worker_maybe_bind_and_lock(rescuer); - - /* - * Slurp in all works issued via this workqueue and - * process'em. - */ - BUG_ON(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &gcwq->worklist, entry) - if (get_work_cwq(work) == cwq) - move_linked_works(work, scheduled, &n); - - process_scheduled_works(rescuer); - - /* - * Leave this gcwq. If keep_working() is %true, notify a - * regular worker; otherwise, we end up with 0 concurrency - * and stalling the execution. - */ - if (keep_working(gcwq)) - wake_up_worker(gcwq); - - spin_unlock_irq(&gcwq->lock); - } - - schedule(); - goto repeat; -} - -struct wq_barrier { - struct work_struct work; - struct completion done; -}; - -static void wq_barrier_func(struct work_struct *work) -{ - struct wq_barrier *barr = container_of(work, struct wq_barrier, work); - complete(&barr->done); -} - -/** - * insert_wq_barrier - insert a barrier work - * @cwq: cwq to insert barrier into - * @barr: wq_barrier to insert - * @target: target work to attach @barr to - * @worker: worker currently executing @target, NULL if @target is not executing - * - * @barr is linked to @target such that @barr is completed only after - * @target finishes execution. Please note that the ordering - * guarantee is observed only with respect to @target and on the local - * cpu. - * - * Currently, a queued barrier can't be canceled. This is because - * try_to_grab_pending() can't determine whether the work to be - * grabbed is at the head of the queue and thus can't clear LINKED - * flag of the previous work while there must be a valid next work - * after a work with LINKED flag set. 
- * - * Note that when @worker is non-NULL, @target may be modified - * underneath us, so we can't reliably determine cwq from @target. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, - struct work_struct *target, struct worker *worker) -{ - struct list_head *head; - unsigned int linked = 0; - - /* - * debugobject calls are safe here even with gcwq->lock locked - * as we know for sure that this will not trigger any of the - * checks and call back into the fixup functions where we - * might deadlock. - */ - INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - init_completion(&barr->done); - - /* - * If @target is currently being executed, schedule the - * barrier to the worker; otherwise, put it after @target. - */ - if (worker) - head = worker->scheduled.next; - else { - unsigned long *bits = work_data_bits(target); - - head = target->entry.next; - /* there can already be other linked works, inherit and set */ - linked = *bits & WORK_STRUCT_LINKED; - __set_bit(WORK_STRUCT_LINKED_BIT, bits); - } - - debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head, - work_color_to_flags(WORK_NO_COLOR) | linked); -} - -/** - * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing - * @wq: workqueue being flushed - * @flush_color: new flush color, < 0 for no-op - * @work_color: new work color, < 0 for no-op - * - * Prepare cwqs for workqueue flushing. - * - * If @flush_color is non-negative, flush_color on all cwqs should be - * -1. If no cwq has in-flight commands at the specified color, all - * cwq->flush_color's stay at -1 and %false is returned. If any cwq - * has in flight commands, its cwq->flush_color is set to - * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq - * wakeup logic is armed and %true is returned. 
- * - * The caller should have initialized @wq->first_flusher prior to - * calling this function with non-negative @flush_color. If - * @flush_color is negative, no flush color update is done and %false - * is returned. - * - * If @work_color is non-negative, all cwqs should have the same - * work_color which is previous to @work_color and all will be - * advanced to @work_color. - * - * CONTEXT: - * mutex_lock(wq->flush_mutex). - * - * RETURNS: - * %true if @flush_color >= 0 and there's something to flush. %false - * otherwise. - */ -static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, - int flush_color, int work_color) -{ - bool wait = false; - unsigned int cpu; - - if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); - atomic_set(&wq->nr_cwqs_to_flush, 1); - } - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; - - spin_lock_irq(&gcwq->lock); - - if (flush_color >= 0) { - BUG_ON(cwq->flush_color != -1); - - if (cwq->nr_in_flight[flush_color]) { - cwq->flush_color = flush_color; - atomic_inc(&wq->nr_cwqs_to_flush); - wait = true; - } - } - - if (work_color >= 0) { - BUG_ON(work_color != work_next_color(cwq->work_color)); - cwq->work_color = work_color; - } - - spin_unlock_irq(&gcwq->lock); - } - - if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) - complete(&wq->first_flusher->done); - - return wait; -} - -/** - * flush_workqueue - ensure that any scheduled work has run to completion. - * @wq: workqueue to flush - * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. 
- */ -void flush_workqueue(struct workqueue_struct *wq) -{ - struct wq_flusher this_flusher = { - .list = LIST_HEAD_INIT(this_flusher.list), - .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), - }; - int next_color; - - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - - mutex_lock(&wq->flush_mutex); - - /* - * Start-to-wait phase - */ - next_color = work_next_color(wq->work_color); - - if (next_color != wq->flush_color) { - /* - * Color space is not full. The current work_color - * becomes our flush_color and work_color is advanced - * by one. - */ - BUG_ON(!list_empty(&wq->flusher_overflow)); - this_flusher.flush_color = wq->work_color; - wq->work_color = next_color; - - if (!wq->first_flusher) { - /* no flush in progress, become the first flusher */ - BUG_ON(wq->flush_color != this_flusher.flush_color); - - wq->first_flusher = &this_flusher; - - if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, - wq->work_color)) { - /* nothing to flush, done */ - wq->flush_color = next_color; - wq->first_flusher = NULL; - goto out_unlock; - } - } else { - /* wait in queue */ - BUG_ON(wq->flush_color == this_flusher.flush_color); - list_add_tail(&this_flusher.list, &wq->flusher_queue); - flush_workqueue_prep_cwqs(wq, -1, wq->work_color); - } - } else { - /* - * Oops, color space is full, wait on overflow queue. - * The next flush completion will assign us - * flush_color and transfer to flusher_queue. - */ - list_add_tail(&this_flusher.list, &wq->flusher_overflow); - } - - mutex_unlock(&wq->flush_mutex); - - wait_for_completion(&this_flusher.done); - - /* - * Wake-up-and-cascade phase - * - * First flushers are responsible for cascading flushes and - * handling overflow. Non-first flushers can simply return. 
- */ - if (wq->first_flusher != &this_flusher) - return; - - mutex_lock(&wq->flush_mutex); - - /* we might have raced, check again with mutex held */ - if (wq->first_flusher != &this_flusher) - goto out_unlock; - - wq->first_flusher = NULL; - - BUG_ON(!list_empty(&this_flusher.list)); - BUG_ON(wq->flush_color != this_flusher.flush_color); - - while (true) { - struct wq_flusher *next, *tmp; - - /* complete all the flushers sharing the current flush color */ - list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { - if (next->flush_color != wq->flush_color) - break; - list_del_init(&next->list); - complete(&next->done); - } - - BUG_ON(!list_empty(&wq->flusher_overflow) && - wq->flush_color != work_next_color(wq->work_color)); - - /* this flush_color is finished, advance by one */ - wq->flush_color = work_next_color(wq->flush_color); - - /* one color has been freed, handle overflow queue */ - if (!list_empty(&wq->flusher_overflow)) { - /* - * Assign the same color to all overflowed - * flushers, advance work_color and append to - * flusher_queue. This is the start-to-wait - * phase for these overflowed flushers. - */ - list_for_each_entry(tmp, &wq->flusher_overflow, list) - tmp->flush_color = wq->work_color; - - wq->work_color = work_next_color(wq->work_color); - - list_splice_tail_init(&wq->flusher_overflow, - &wq->flusher_queue); - flush_workqueue_prep_cwqs(wq, -1, wq->work_color); - } - - if (list_empty(&wq->flusher_queue)) { - BUG_ON(wq->flush_color != wq->work_color); - break; - } - - /* - * Need to flush more colors. Make the next flusher - * the new first flusher and arm cwqs. - */ - BUG_ON(wq->flush_color == wq->work_color); - BUG_ON(wq->flush_color != next->flush_color); - - list_del_init(&next->list); - wq->first_flusher = next; - - if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) - break; - - /* - * Meh... this color is already done, clear first - * flusher and repeat cascading. 
- */ - wq->first_flusher = NULL; - } - -out_unlock: - mutex_unlock(&wq->flush_mutex); -} -EXPORT_SYMBOL_GPL(flush_workqueue); - -/** - * drain_workqueue - drain a workqueue - * @wq: workqueue to drain - * - * Wait until the workqueue becomes empty. While draining is in progress, - * only chain queueing is allowed. IOW, only currently pending or running - * work items on @wq can queue further work items on it. @wq is flushed - * repeatedly until it becomes empty. The number of flushing is detemined - * by the depth of chaining and should be relatively short. Whine if it - * takes too long. - */ -void drain_workqueue(struct workqueue_struct *wq) -{ - unsigned int flush_cnt = 0; - unsigned int cpu; - - /* - * __queue_work() needs to test whether there are drainers, is much - * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. - */ - spin_lock(&workqueue_lock); - if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; - spin_unlock(&workqueue_lock); -reflush: - flush_workqueue(wq); - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - bool drained; - - spin_lock_irq(&cwq->gcwq->lock); - drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->gcwq->lock); - - if (drained) - continue; - - if (++flush_cnt == 10 || - (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", - wq->name, flush_cnt); - goto reflush; - } - - spin_lock(&workqueue_lock); - if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; - spin_unlock(&workqueue_lock); -} -EXPORT_SYMBOL_GPL(drain_workqueue); - -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, - bool wait_executing) -{ - struct worker *worker = NULL; - struct global_cwq *gcwq; - struct cpu_workqueue_struct *cwq; - - might_sleep(); - gcwq = get_work_gcwq(work); - if (!gcwq) - return false; - - 
spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued to a different gcwq under us, we - * are not going to wait. - */ - smp_rmb(); - cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->gcwq)) - goto already_gone; - } else if (wait_executing) { - worker = find_worker_executing_work(gcwq, work); - if (!worker) - goto already_gone; - cwq = worker->current_cwq; - } else - goto already_gone; - - insert_wq_barrier(cwq, barr, work, worker); - spin_unlock_irq(&gcwq->lock); - - /* - * If @max_active is 1 or rescuer is in use, flushing another work - * item on the same workqueue may lead to deadlock. Make sure the - * flusher is not running on the same workqueue by verifying write - * access. - */ - if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) - lock_map_acquire(&cwq->wq->lockdep_map); - else - lock_map_acquire_read(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - return true; -already_gone: - spin_unlock_irq(&gcwq->lock); - return false; -} - -/** - * flush_work - wait for a work to finish executing the last queueing instance - * @work: the work to flush - * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. - * - * RETURNS: - * %true if flush_work() waited for the work to finish execution, - * %false if it was already idle. 
- */ -bool flush_work(struct work_struct *work) -{ - struct wq_barrier barr; - - if (start_flush_work(work, &barr, true)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else - return false; -} -EXPORT_SYMBOL_GPL(flush_work); - -static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else - return false; -} - -static bool wait_on_work(struct work_struct *work) -{ - bool ret = false; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - ret |= wait_on_cpu_work(get_gcwq(cpu), work); - return ret; -} - -/** - * flush_work_sync - wait until a work has finished execution - * @work: the work to flush - * - * Wait until @work has finished execution. On return, it's - * guaranteed that all queueing instances of @work which happened - * before this function is called are finished. In other words, if - * @work hasn't been requeued since this function was called, @work is - * guaranteed to be idle on return. - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. 
- */ -bool flush_work_sync(struct work_struct *work) -{ - struct wq_barrier barr; - bool pending, waited; - - /* we'll wait for executions separately, queue barr only if pending */ - pending = start_flush_work(work, &barr, false); - - /* wait for executions to finish */ - waited = wait_on_work(work); - - /* wait for the pending one */ - if (pending) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - } - - return pending || waited; -} -EXPORT_SYMBOL_GPL(flush_work_sync); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct global_cwq *gcwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - gcwq = get_work_gcwq(work); - if (!gcwq) - return ret; - - spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). 
- */ - smp_rmb(); - if (gcwq == get_work_gcwq(work)) { - debug_work_deactivate(work); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work), - *work_data_bits(work) & WORK_STRUCT_DELAYED); - ret = 1; - } - } - spin_unlock_irq(&gcwq->lock); - - return ret; -} - -static bool __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) -{ - int ret; - - do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); - wait_on_work(work); - } while (unlikely(ret < 0)); - - clear_work_data(work); - return ret; -} - -/** - * cancel_work_sync - cancel a work and wait for it to finish - * @work: the work to cancel - * - * Cancel @work and wait for its execution to finish. This function - * can be used even if the work re-queues itself or migrates to - * another workqueue. On return from this function, @work is - * guaranteed to be not pending or executing on any CPU. - * - * cancel_work_sync(&delayed_work->work) must not be used for - * delayed_work's. Use cancel_delayed_work_sync() instead. - * - * The caller must ensure that the workqueue on which @work was last - * queued can't be destroyed before this function returns. - * - * RETURNS: - * %true if @work was pending, %false otherwise. - */ -bool cancel_work_sync(struct work_struct *work) -{ - return __cancel_work_timer(work, NULL); -} -EXPORT_SYMBOL_GPL(cancel_work_sync); - -/** - * flush_delayed_work - wait for a dwork to finish executing the last queueing - * @dwork: the delayed work to flush - * - * Delayed timer is cancelled and the pending work is queued for - * immediate execution. Like flush_work(), this function only - * considers the last queueing instance of @dwork. - * - * RETURNS: - * %true if flush_work() waited for the work to finish execution, - * %false if it was already idle. 
- */ -bool flush_delayed_work(struct delayed_work *dwork) -{ - if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), - get_work_cwq(&dwork->work)->wq, &dwork->work); - return flush_work(&dwork->work); -} -EXPORT_SYMBOL(flush_delayed_work); - -/** - * flush_delayed_work_sync - wait for a dwork to finish - * @dwork: the delayed work to flush - * - * Delayed timer is cancelled and the pending work is queued for - * execution immediately. Other than timer handling, its behavior - * is identical to flush_work_sync(). - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_delayed_work_sync(struct delayed_work *dwork) -{ - if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), - get_work_cwq(&dwork->work)->wq, &dwork->work); - return flush_work_sync(&dwork->work); -} -EXPORT_SYMBOL(flush_delayed_work_sync); - -/** - * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish - * @dwork: the delayed work cancel - * - * This is cancel_work_sync() for delayed works. - * - * RETURNS: - * %true if @dwork was pending, %false otherwise. - */ -bool cancel_delayed_work_sync(struct delayed_work *dwork) -{ - return __cancel_work_timer(&dwork->work, &dwork->timer); -} -EXPORT_SYMBOL(cancel_delayed_work_sync); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns zero if @work was already on the kernel-global workqueue and - * non-zero otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. 
- */ -int schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/* - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -int schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_on_each_cpu - execute a function synchronously on each online CPU - * @func: the function to call - * - * schedule_on_each_cpu() executes @func on each online CPU using the - * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -int schedule_on_each_cpu(work_func_t func) -{ - int cpu; - struct work_struct __percpu *works; - - works = alloc_percpu(struct work_struct); - if (!works) - return -ENOMEM; - - get_online_cpus(); - - for_each_online_cpu(cpu) { - struct work_struct *work = per_cpu_ptr(works, cpu); - - INIT_WORK(work, func); - schedule_work_on(cpu, work); - } - - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); - - put_online_cpus(); - free_percpu(works); - return 0; -} - -/** - * flush_scheduled_work - ensure that any scheduled work has run to completion. - * - * Forces execution of the kernel-global workqueue and blocks until its - * completion. - * - * Think twice before calling this function! It's very easy to get into - * trouble if you don't take great care. Either of the following situations - * will lead to deadlock: - * - * One of the work items currently on the workqueue needs to acquire - * a lock held by your code or its caller. - * - * Your code is running in the context of a work routine. - * - * They will be detected by lockdep when they occur, but the first might not - * occur very often. It depends on what work items are on the workqueue and - * what locks they need, which you have no control over. - * - * In most situations flushing the entire workqueue is overkill; you merely - * need to know that a particular work item isn't queued and isn't running. - * In such cases you should use cancel_delayed_work_sync() or - * cancel_work_sync() instead. - */ -void flush_scheduled_work(void) -{ - flush_workqueue(system_wq); -} -EXPORT_SYMBOL(flush_scheduled_work); - -/** - * execute_in_process_context - reliably execute the routine with user context - * @fn: the function to execute - * @ew: guaranteed storage for the execute work structure (must - * be available when the work executes) - * - * Executes the function immediately if process context is available, - * otherwise schedules the function for delayed execution. 
- * - * Returns: 0 - function was executed - * 1 - function was scheduled for execution - */ -int execute_in_process_context(work_func_t fn, struct execute_work *ew) -{ - if (!in_interrupt()) { - fn(&ew->work); - return 0; - } - - INIT_WORK(&ew->work, fn); - schedule_work(&ew->work); - - return 1; -} -EXPORT_SYMBOL_GPL(execute_in_process_context); - -int keventd_up(void) -{ - return system_wq != NULL; -} - -static int alloc_cwqs(struct workqueue_struct *wq) -{ - /* - * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. - * Make sure that the alignment isn't lower than that of - * unsigned long long. - */ - const size_t size = sizeof(struct cpu_workqueue_struct); - const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, - __alignof__(unsigned long long)); -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - - if (percpu) - wq->cpu_wq.pcpu = __alloc_percpu(size, align); - else { - void *ptr; - - /* - * Allocate enough room to align cwq and put an extra - * pointer at the end pointing back to the originally - * allocated pointer which will be used for free. - */ - ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); - if (ptr) { - wq->cpu_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->cpu_wq.single + 1) = ptr; - } - } - - /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); - return wq->cpu_wq.v ? 0 : -ENOMEM; -} - -static void free_cwqs(struct workqueue_struct *wq) -{ -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - - if (percpu) - free_percpu(wq->cpu_wq.pcpu); - else if (wq->cpu_wq.single) { - /* the pointer to free is stored right after the cwq */ - kfree(*(void **)(wq->cpu_wq.single + 1)); - } -} - -static int wq_clamp_max_active(int max_active, unsigned int flags, - const char *name) -{ - int lim = flags & WQ_UNBOUND ? 
WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - - if (max_active < 1 || max_active > lim) - printk(KERN_WARNING "workqueue: max_active %d requested for %s " - "is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); - - return clamp_val(max_active, 1, lim); -} - -struct workqueue_struct *__alloc_workqueue_key(const char *fmt, - unsigned int flags, - int max_active, - struct lock_class_key *key, - const char *lock_name, ...) -{ - va_list args, args1; - struct workqueue_struct *wq; - unsigned int cpu; - size_t namelen; - - /* determine namelen, allocate wq and format name */ - va_start(args, lock_name); - va_copy(args1, args); - namelen = vsnprintf(NULL, 0, fmt, args) + 1; - - wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); - if (!wq) - goto err; - - vsnprintf(wq->name, namelen, fmt, args1); - va_end(args); - va_end(args1); - - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; - - /* - * Unbound workqueues aren't concurrency managed and should be - * dispatched to workers immediately. 
- */ - if (flags & WQ_UNBOUND) - flags |= WQ_HIGHPRI; - - max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, wq->name); - - /* init wq */ - wq->flags = flags; - wq->saved_max_active = max_active; - mutex_init(&wq->flush_mutex); - atomic_set(&wq->nr_cwqs_to_flush, 0); - INIT_LIST_HEAD(&wq->flusher_queue); - INIT_LIST_HEAD(&wq->flusher_overflow); - - lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - INIT_LIST_HEAD(&wq->list); - - if (alloc_cwqs(wq) < 0) - goto err; - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = get_gcwq(cpu); - - BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->gcwq = gcwq; - cwq->wq = wq; - cwq->flush_color = -1; - cwq->max_active = max_active; - INIT_LIST_HEAD(&cwq->delayed_works); - } - - if (flags & WQ_RESCUER) { - struct worker *rescuer; - - if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) - goto err; - - wq->rescuer = rescuer = alloc_worker(); - if (!rescuer) - goto err; - - rescuer->task = kthread_create(rescuer_thread, wq, "%s", - wq->name); - if (IS_ERR(rescuer->task)) - goto err; - - rescuer->task->flags |= PF_THREAD_BOUND; - wake_up_process(rescuer->task); - } - - /* - * workqueue_lock protects global freeze state and workqueues - * list. Grab it, set max_active accordingly and add the new - * workqueue to workqueues list. - */ - spin_lock(&workqueue_lock); - - if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_cwq_cpu(cpu, wq) - get_cwq(cpu, wq)->max_active = 0; - - list_add(&wq->list, &workqueues); - - spin_unlock(&workqueue_lock); - - return wq; -err: - if (wq) { - free_cwqs(wq); - free_mayday_mask(wq->mayday_mask); - kfree(wq->rescuer); - kfree(wq); - } - return NULL; -} -EXPORT_SYMBOL_GPL(__alloc_workqueue_key); - -/** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue - * - * Safely destroy a workqueue. All work currently pending will be done first. 
- */ -void destroy_workqueue(struct workqueue_struct *wq) -{ - unsigned int cpu; - - /* drain it before proceeding with destruction */ - drain_workqueue(wq); - - /* - * wq list is used to freeze wq, remove from list after - * flushing is complete in case freeze races us. - */ - spin_lock(&workqueue_lock); - list_del(&wq->list); - spin_unlock(&workqueue_lock); - - /* sanity check */ - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - int i; - - for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(cwq->nr_in_flight[i]); - BUG_ON(cwq->nr_active); - BUG_ON(!list_empty(&cwq->delayed_works)); - } - - if (wq->flags & WQ_RESCUER) { - kthread_stop(wq->rescuer->task); - free_mayday_mask(wq->mayday_mask); - kfree(wq->rescuer); - } - - free_cwqs(wq); - kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); - -/** - * workqueue_set_max_active - adjust max_active of a workqueue - * @wq: target workqueue - * @max_active: new max_active value. - * - * Set max_active of @wq to @max_active. - * - * CONTEXT: - * Don't call from IRQ context. - */ -void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) -{ - unsigned int cpu; - - max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - - spin_lock(&workqueue_lock); - - wq->saved_max_active = max_active; - - for_each_cwq_cpu(cpu, wq) { - struct global_cwq *gcwq = get_gcwq(cpu); - - spin_lock_irq(&gcwq->lock); - - if (!(wq->flags & WQ_FREEZABLE) || - !(gcwq->flags & GCWQ_FREEZING)) - get_cwq(gcwq->cpu, wq)->max_active = max_active; - - spin_unlock_irq(&gcwq->lock); - } - - spin_unlock(&workqueue_lock); -} -EXPORT_SYMBOL_GPL(workqueue_set_max_active); - -/** - * workqueue_congested - test whether a workqueue is congested - * @cpu: CPU in question - * @wq: target workqueue - * - * Test whether @wq's cpu workqueue for @cpu is congested. There is - * no synchronization around this function and the test result is - * unreliable and only useful as advisory hints or for debugging. 
- * - * RETURNS: - * %true if congested, %false otherwise. - */ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) -{ - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - return !list_empty(&cwq->delayed_works); -} -EXPORT_SYMBOL_GPL(workqueue_congested); - -/** - * work_cpu - return the last known associated cpu for @work - * @work: the work of interest - * - * RETURNS: - * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. - */ -unsigned int work_cpu(struct work_struct *work) -{ - struct global_cwq *gcwq = get_work_gcwq(work); - - return gcwq ? gcwq->cpu : WORK_CPU_NONE; -} -EXPORT_SYMBOL_GPL(work_cpu); - -/** - * work_busy - test whether a work is currently pending or running - * @work: the work to be tested - * - * Test whether @work is currently pending or running. There is no - * synchronization around this function and the test result is - * unreliable and only useful as advisory hints or for debugging. - * Especially for reentrant wqs, the pending state might hide the - * running state. - * - * RETURNS: - * OR'd bitmask of WORK_BUSY_* bits. - */ -unsigned int work_busy(struct work_struct *work) -{ - struct global_cwq *gcwq = get_work_gcwq(work); - unsigned long flags; - unsigned int ret = 0; - - if (!gcwq) - return false; - - spin_lock_irqsave(&gcwq->lock, flags); - - if (work_pending(work)) - ret |= WORK_BUSY_PENDING; - if (find_worker_executing_work(gcwq, work)) - ret |= WORK_BUSY_RUNNING; - - spin_unlock_irqrestore(&gcwq->lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(work_busy); - -/* - * CPU hotplug. - * - * There are two challenges in supporting CPU hotplug. Firstly, there - * are a lot of assumptions on strong associations among work, cwq and - * gcwq which make migrating pending and scheduled works very - * difficult to implement without impacting hot paths. Secondly, - * gcwqs serve mix of short, long and very long running works making - * blocked draining impractical. 
- * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- - */ - -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. 
- * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) - -static int __cpuinit trustee_thread(void *__gcwq) -{ - struct global_cwq *gcwq = __gcwq; - struct worker *worker; - struct work_struct *work; - struct hlist_node *pos; - long rc; - int i; - - BUG_ON(gcwq->cpu != smp_processor_id()); - - spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. 
- */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); - BUG_ON(rc < 0); - - gcwq->flags |= GCWQ_MANAGING_WORKERS; - - list_for_each_entry(worker, &gcwq->idle_list, entry) - worker->flags |= WORKER_ROGUE; - - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; - - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. - */ - spin_unlock_irq(&gcwq->lock); - schedule(); - spin_lock_irq(&gcwq->lock); - - /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. - */ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); - - spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); - spin_lock_irq(&gcwq->lock); - - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); - - /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. 
- */ - while (gcwq->nr_workers != gcwq->nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { - int nr_works = 0; - - list_for_each_entry(work, &gcwq->worklist, entry) { - send_mayday(work); - nr_works++; - } - - list_for_each_entry(worker, &gcwq->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } - - if (need_to_create_worker(gcwq)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); - } - } - - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } - - /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. - */ - do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, - struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); - - /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. - */ - WARN_ON(!list_empty(&gcwq->idle_list)); - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. 
- */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; -} - -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. 
- */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } -} - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); - unsigned long flags; - - action &= ~CPU_TASKS_FROZEN; - - switch (action) { - case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - new_worker = create_worker(gcwq, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; - } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); - - switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; - break; - - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. 
- */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; - - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - gcwq->flags &= ~GCWQ_DISASSOCIATED; - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; - } - - spin_unlock_irqrestore(&gcwq->lock, flags); - - return notifier_from_errno(0); -} - -#ifdef CONFIG_SMP - -struct work_for_cpu { - struct completion completion; - long (*fn)(void *); - void *arg; - long ret; -}; - -static int do_work_for_cpu(void *_wfc) -{ - struct work_for_cpu *wfc = _wfc; - wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; -} - -/** - * work_on_cpu - run a function in user context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function arg - * - * This will return the value @fn returns. - * It is up to the caller to ensure that the cpu doesn't go offline. - * The caller must not hold any locks which would prevent @fn from completing. 
- */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) -{ - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; - - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); - return wfc.ret; -} -EXPORT_SYMBOL_GPL(work_on_cpu); -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_FREEZER - -/** - * freeze_workqueues_begin - begin freezing workqueues - * - * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their frozen_works list instead of - * gcwq->worklist. - * - * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. - */ -void freeze_workqueues_begin(void) -{ - unsigned int cpu; - - spin_lock(&workqueue_lock); - - BUG_ON(workqueue_freezing); - workqueue_freezing = true; - - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); - struct workqueue_struct *wq; - - spin_lock_irq(&gcwq->lock); - - BUG_ON(gcwq->flags & GCWQ_FREEZING); - gcwq->flags |= GCWQ_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - if (cwq && wq->flags & WQ_FREEZABLE) - cwq->max_active = 0; - } - - spin_unlock_irq(&gcwq->lock); - } - - spin_unlock(&workqueue_lock); -} - -/** - * freeze_workqueues_busy - are freezable workqueues still busy? - * - * Check whether freezing is complete. This function must be called - * between freeze_workqueues_begin() and thaw_workqueues(). - * - * CONTEXT: - * Grabs and releases workqueue_lock. - * - * RETURNS: - * %true if some freezable workqueues are still busy. %false if freezing - * is complete. 
- */ -bool freeze_workqueues_busy(void) -{ - unsigned int cpu; - bool busy = false; - - spin_lock(&workqueue_lock); - - BUG_ON(!workqueue_freezing); - - for_each_gcwq_cpu(cpu) { - struct workqueue_struct *wq; - /* - * nr_active is monotonically decreasing. It's safe - * to peek without lock. - */ - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - if (!cwq || !(wq->flags & WQ_FREEZABLE)) - continue; - - BUG_ON(cwq->nr_active < 0); - if (cwq->nr_active) { - busy = true; - goto out_unlock; - } - } - } -out_unlock: - spin_unlock(&workqueue_lock); - return busy; -} - -/** - * thaw_workqueues - thaw workqueues - * - * Thaw workqueues. Normal queueing is restored and all collected - * frozen works are transferred to their respective gcwq worklists. - * - * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. - */ -void thaw_workqueues(void) -{ - unsigned int cpu; - - spin_lock(&workqueue_lock); - - if (!workqueue_freezing) - goto out_unlock; - - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); - struct workqueue_struct *wq; - - spin_lock_irq(&gcwq->lock); - - BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); - gcwq->flags &= ~GCWQ_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - if (!cwq || !(wq->flags & WQ_FREEZABLE)) - continue; - - /* restore max_active and repopulate worklist */ - cwq->max_active = wq->saved_max_active; - - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } - - wake_up_worker(gcwq); - - spin_unlock_irq(&gcwq->lock); - } - - workqueue_freezing = false; -out_unlock: - spin_unlock(&workqueue_lock); -} -#endif /* CONFIG_FREEZER */ - -static int __init init_workqueues(void) -{ - unsigned int cpu; - int i; - - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); - - /* initialize gcwqs */ - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = 
get_gcwq(cpu); - - spin_lock_init(&gcwq->lock); - INIT_LIST_HEAD(&gcwq->worklist); - gcwq->cpu = cpu; - gcwq->flags |= GCWQ_DISASSOCIATED; - - INIT_LIST_HEAD(&gcwq->idle_list); - for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) - INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - - init_timer_deferrable(&gcwq->idle_timer); - gcwq->idle_timer.function = idle_worker_timeout; - gcwq->idle_timer.data = (unsigned long)gcwq; - - setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, - (unsigned long)gcwq); - - ida_init(&gcwq->worker_ida); - - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); - } - - /* create the initial worker */ - for_each_online_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; - - if (cpu != WORK_CPU_UNBOUND) - gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(gcwq, true); - BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - spin_unlock_irq(&gcwq->lock); - } - - system_wq = alloc_workqueue("events", 0, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); - system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); - system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, - WQ_UNBOUND_MAX_ACTIVE); - system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); - system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", - WQ_NON_REENTRANT | WQ_FREEZABLE, 0); - BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq || - !system_nrt_freezable_wq); - return 0; -} -early_initcall(init_workqueues); diff --git a/introduction/introduction.tex b/introduction/introduction.tex new file mode 100644 index 0000000..726b054 --- /dev/null +++ b/introduction/introduction.tex @@ -0,0 +1,57 @@ +\section{Introduction} +\label{Introduction} + +%The main task for this group coursework is to build a DIFF tool together with a PATCH tool that is copy and move aware. 
+%\href{http://www.gnu.org/software/diffutils/}{GNU DIFF} works line by line and determines the differences between two text files, producing a sequence of commands that can be saved to a DIFF file. +%\href{http://www.gnu.org/software/diffutils/}{GNU PATCH} can then read these commands and apply them to the first file, which will recreate the second file input to DIFF. +%Commands either copy an ordered set of lines from the first file, delete an ordered set of lines from the first file, or insert a new set of lines. +% +%An issue with this is that if a line is copied numerous times throughout the second file, a command must insert it every time. +%This is inefficient, and can result in a large DIFF file. +%%Specific line sequences are either inserted or deleted by a command. +%%DIFF identifies an insertion and deletion in the same area as a change of line sequence. +%% +%%Our DIFF tool works differently than the known DIFF. +%%It has the following new features such as finding the longest segments that are copied or moved from one file to the other file. +%%It shows lines that are in the new version, but not in the old version. +%%It shows if lines are in the new version that is also present in the old. +%%But instead of showing the lines, it shows where the lines in the old files are. +% +%PATCH tool is the automated process of committing or applying of changes in files. +%This is necessary as many people can work on different copies of the same file. +%The parts they changed are written in the form of DIFF output. +%Hence, PATCH tool takes a DIFF output and applies its commands on a file. +%The PATCH commands are insert, delete and change lines. +% +%PATCH tool example: A file is created called DIFFS that includes the differences between two files, File1 and File2 as DIFF output. +%This file can be shared with other people now. +%Then, they can decide whether they want to commit the changes from copy of File1 or not. 
+%After they commit the change, PATCH applies the changes in DIFFS on File2. +%Hence, both File1 and File2 are identical now. +%If the DIFF output includes changes from several files, PATCH can process and apply them. +% +%Our PATCH tool works similar to the above mentioned known PATCH tool. +%But it uses an old version and an output of our DIFF tool to produce the new version from it. +%Hence, this DIFF/PATCH tool can be used to determine and distribute changes easily and effectively. +% +%In this report, the requirements of this tool are identified. +%UML diagrams are included that shows the architecture of the detector. +%The description about the implementation of this tool, how it works and the testing performed are included. +%Finally, the results produced by this tool are evaluated including the main things learnt from this coursework. + +\href{http://www.gnu.org/software/diffutils/}{GNU DIFF} is a tool that allows a programmer to identify the differences between two files, and outputs a list of commands to transform the first into the second. +These commands can be saved and used as input to GNU PATCH, together with the first file, in order to perform the transformation. +This is basic change control, and it revolutionised both single and multi-user programming, allowing file differences to be exchanged as an alternative to complete modified files. + +There are three DIFF commands: copy a set of lines from the first file, delete a set of lines from the first file, or insert a set of lines. +The commands in the patch file need to be applied in order, otherwise the transformation will be incorrect. Moreover, due to the nature of the instructions, every line in the original file needs to be handled explicitly, resulting in more instructions and hence longer patch files. + +Clone detection is the process of identifying matching sections of text between two files. 
+Much research is done in the field to enhance its application in areas such as plagiarism detection and intellectual property theft. +As a result, much faster methods are available today to detect matching sections of text than were available when GNU DIFF was designed. +In particular, the Suffix Tree method allows for an extremely fast detection algorithm. + +Our aim is to combine these two concepts, and create a new pair of tools: \texttt{diffr} and \texttt{patchr}. +These tools will be copy and move aware, and support out-of-order clone detection. +As a result, only two commands are necessary: copy a set of ordered lines from the first file, or insert a set of lines. +The tools will be extremely efficient, and the output from \texttt{diffr} will contain the bare minimum that is needed to articulate the set of differences between the files. diff --git a/patch/pom.xml b/patch/pom.xml deleted file mode 100644 index 4f34f67..0000000 --- a/patch/pom.xml +++ /dev/null @@ -1,47 +0,0 @@ - - - 4.0.0 - - - diffr - parent - 1.0-SNAPSHOT - - - diffr - patch - 1.0-SNAPSHOT - jar - ${project.groupId}.${project.artifactId} - Patch for diffr. - - - - org.testng - testng - ${testng.version} - test - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - test - - - - diffr - util - ${current.version} - - - diffr - util - ${current.version} - test - tests - - - diff --git a/patch/src/main/java/diffr/patch/IllegalPatchFileException.java b/patch/src/main/java/diffr/patch/IllegalPatchFileException.java deleted file mode 100644 index 2074c39..0000000 --- a/patch/src/main/java/diffr/patch/IllegalPatchFileException.java +++ /dev/null @@ -1,21 +0,0 @@ -package diffr.patch; - -/** - * Exception that announces an illegal patch file. - * - * @author William Martin - * @author Amaury Couste - * @since 0.3 - */ -public class IllegalPatchFileException extends Exception { - - public static final String MESSAGE = "Error. 
Illegal patch file."; - - public IllegalPatchFileException(final String text) { - super(text); - } - - public IllegalPatchFileException() { - super(MESSAGE); - } -} diff --git a/patch/src/main/java/diffr/patch/Main.java b/patch/src/main/java/diffr/patch/Main.java deleted file mode 100644 index e27b28d..0000000 --- a/patch/src/main/java/diffr/patch/Main.java +++ /dev/null @@ -1,118 +0,0 @@ -package diffr.patch; - -import com.google.common.base.Optional; -import com.google.common.io.Files; -import diffr.util.ArgumentsProcessor; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.List; - -/** - * Main entry point to diffr's PATCH tool. - *

- *

- * Expects two arguments: - *

    - *
  • <original-file> - The original file.
  • - *
  • <diff-file> - The diff file created using the diff tool.
  • - *
- *

- * - * @author Jakub D Kozlowski - * @author Amaury Couste - * @author William Martin - * @since 0.1 - */ -public final class Main { - - /** - * Prints the usage of this tool. - */ - private static void printUsage() { - System.out.println("Usage: \n" + - " patchr \n" + - " patchr -o "); - } - - /** - * Runs the patch tool on the original file. - * - * @param args arguments to this tool. - * - * @return exit code. - */ - public static int run(final String... args) { - - try { - if (ArgumentsProcessor.containsHelpArgument(args) - || (2 != args.length - && 4 != args.length)) { - printUsage(); - return -1; - } - - final File firstFile = new File(args[0]); - final File patchFile = new File(args[1]); - - if (!firstFile.exists()) { - System.err.println("File " + firstFile + " not found."); - return -1; - } - - if (!patchFile.exists()) { - System.err.println("File " + patchFile + " not found."); - return -1; - } - - final List firstFileStrings = Files.readLines(firstFile, Charset.defaultCharset()); - final List patchFileStrings = Files.readLines(patchFile, Charset.defaultCharset()); - - final Patchr patchr = new Patchr(firstFileStrings, patchFileStrings); - - final List newFileStrings = patchr.patch(); - - final Optional outputFile = ArgumentsProcessor.extractOutputFile(args); - - if (4 == args.length - && outputFile.isPresent()) { - - final File file = new File(outputFile.get()); - final BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(file)); - for (final String line : newFileStrings) { - bufferedWriter.write(line); - bufferedWriter.write("\n"); - } - bufferedWriter.close(); - } - else { - for (final String line : newFileStrings) { - System.out.println(line); - } - System.out.flush(); - } - - return 0; - } - catch (final IOException io) { - System.err.println("There was a problem reading the files: " + io); - return -1; - } - catch (final IllegalPatchFileException ipfe) { - System.err.println("The patch file is incorrect, exiting."); - return -1; - } - 
} - - /** - * Invokes {@link #run(String...)} and calls {@link System#exit(int)}. - * - * @param args arguments to this tool. - */ - public static void main(String... args) { - System.exit(run(args)); - } -} diff --git a/patch/src/main/java/diffr/patch/Patchr.java b/patch/src/main/java/diffr/patch/Patchr.java deleted file mode 100644 index 16e6f4b..0000000 --- a/patch/src/main/java/diffr/patch/Patchr.java +++ /dev/null @@ -1,59 +0,0 @@ -package diffr.patch; - -import com.google.common.collect.Range; -import diffr.util.instruction.*; - -import java.util.ArrayList; -import java.util.List; - -/** - * Generates a list of Strings by applying a patch to the original file. - * - * @author Amaury Couste - * @author William Martin - * @since 0.3 - */ -public class Patchr { - - private final List originalFile; - private final List instructions; - - /** - * Default constructor. - * - * @param originalFile the original file. - * @param patchFile the patch file. - * @throws IllegalPatchFileException if there was an error reading the patch file. - */ - public Patchr(final List originalFile, final List patchFile) throws IllegalPatchFileException { - this.originalFile = originalFile; - try { - this.instructions = Instructions.readInstructions(patchFile); - } catch (final IllegalPatchInstructionException ipe) { - throw new IllegalPatchFileException("Error. Illegal patch file: " + ipe.getMessage()); - } - } - - /** - * Applies the patch file to the original file. - * - * @return a list of strings representing the patched file. 
- */ - public List patch() { - final List patchedFile = new ArrayList(); - for (final Instruction instruction : instructions) { - switch (instruction.getType()) { - case Copy: - final CopyInstruction copyInstruction = (CopyInstruction) instruction; - final Range range = copyInstruction.getRange(); - patchedFile.addAll(originalFile.subList(range.lowerEndpoint(), 1 + range.upperEndpoint())); - break; - case Insert: - final InsertInstruction insertInstruction = (InsertInstruction) instruction; - patchedFile.add(insertInstruction.getText()); - } - } - return patchedFile; - } - -} diff --git a/patch/src/test/java/diffr/patch/IllegalPatchFileExceptionTest.java b/patch/src/test/java/diffr/patch/IllegalPatchFileExceptionTest.java deleted file mode 100644 index 5b324db..0000000 --- a/patch/src/test/java/diffr/patch/IllegalPatchFileExceptionTest.java +++ /dev/null @@ -1,32 +0,0 @@ -package diffr.patch; - -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link diffr.patch.IllegalPatchFileException}. - * - * @author William Martin - * @since 1.0 - */ -public class IllegalPatchFileExceptionTest { - - /** - * Tests the default message. - */ - @Test - public void testDefaultMessage() { - assertThat(new IllegalPatchFileException().getMessage(), is(IllegalPatchFileException.MESSAGE)); - } - - /** - * Tests a custom message. 
- */ - @Test - public void testMessage() { - final String testMessage = "test message"; - assertThat(new IllegalPatchFileException(testMessage).getMessage(), is(testMessage)); - } -} diff --git a/patch/src/test/java/diffr/patch/PatchrTest.java b/patch/src/test/java/diffr/patch/PatchrTest.java deleted file mode 100644 index d87150b..0000000 --- a/patch/src/test/java/diffr/patch/PatchrTest.java +++ /dev/null @@ -1,61 +0,0 @@ -package diffr.patch; - -import com.google.common.collect.Lists; -import com.google.common.io.Files; -import com.google.common.io.Resources; -import diffr.util.instruction.IllegalPatchInstructionException; -import org.testng.annotations.Test; - -import java.io.File; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.List; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link Patchr} - */ -public class PatchrTest { - - /** - * Tests patch on ready made files. - */ - @Test - public void testExistingFiles() throws Exception { - final File originalFile = new File(Resources.getResource("files/copyright.txt").toURI()); - final File patchFile = new File(Resources.getResource("patches/cp.patch").toURI()); - final List firstFileStrings = Files.readLines(originalFile, Charset.defaultCharset()); - final List patchFileStrings = Files.readLines(patchFile, Charset.defaultCharset()); - - new Patchr(firstFileStrings, patchFileStrings).patch(); - } - - /** - * Tests whether an artificial example works. 
- */ - @Test - public void testPatch() throws IllegalPatchInstructionException, IllegalPatchFileException { - final List original = Lists.newArrayList("hello world!", "2", ""); - final List next = Lists.newArrayList("hello world.", "2", "3", "2", "", "hello world!"); - final List patch = Lists.newArrayList("> hello world.", "1,1", "> 3", "1,2", "0,0"); - - final Patchr patchr = new Patchr(original, patch); - final List result = patchr.patch(); - System.out.println(result.toString()); - assertThat(next.size(), is(result.size())); - for (int i = 0; i < next.size(); i++) { - assertThat(result.get(i), is(next.get(i))); - } - } - - /** - * Tests whether the Constructor throws an exception when given an illegal instruction. - */ - @Test(expectedExceptions = IllegalPatchFileException.class) - public void testIllegalInstructionConstructor() throws IllegalPatchFileException { - final List broken = Lists.newArrayList("brokenInstruction"); - new Patchr(new ArrayList(), broken); - } -} diff --git a/patch/src/test/resources/files/copyright.txt b/patch/src/test/resources/files/copyright.txt deleted file mode 100644 index ed6b0f5..0000000 --- a/patch/src/test/resources/files/copyright.txt +++ /dev/null @@ -1,42 +0,0 @@ -gcc-defaults is Copyright (C) 2000, 2001, 2006, 2009 Debian. - -These scripts are free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 2, or (at your option) any -later version. - -On Debian GNU/Linux systems, the complete text of the GNU General -Public License can be found in `/usr/share/common-licenses/GPL'. - -The c89 and c99 man pages are taken from netbsd: - -Copyright (c) 1999 The NetBSD Foundation, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: -1. 
Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. -3. All advertising materials mentioning features or use of this software - must display the following acknowledgement: - This product includes software developed by the NetBSD - Foundation, Inc. and its contributors. -4. Neither the name of The NetBSD Foundation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS -``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS -BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
diff --git a/patch/src/test/resources/patches/RangeTooHigh.patch b/patch/src/test/resources/patches/RangeTooHigh.patch deleted file mode 100644 index a85f578..0000000 --- a/patch/src/test/resources/patches/RangeTooHigh.patch +++ /dev/null @@ -1,3 +0,0 @@ -1,31 -> YO YO YO -32,134 \ No newline at end of file diff --git a/patch/src/test/resources/patches/cp.patch b/patch/src/test/resources/patches/cp.patch deleted file mode 100644 index 77fc94c..0000000 --- a/patch/src/test/resources/patches/cp.patch +++ /dev/null @@ -1,3 +0,0 @@ -0,30 -> YO YO YO -31,41 diff --git a/pom.xml b/pom.xml deleted file mode 100644 index 6973f9a..0000000 --- a/pom.xml +++ /dev/null @@ -1,179 +0,0 @@ - - - 4.0.0 - - diffr - parent - 1.0-SNAPSHOT - pom - ${project.groupId}.${project.artifactId} - diffr - Intelligent DIFF/PATCH tool that knows copy and move, and has an 'r' at the end of its name. - - - - - acouste - Amaury Couste - ben@bantertrain.com - - Developer - - UCL - - - - Sarina Gurung - sarina.gurung.11@ucl.ac.uk - - Developer - - UCL - - - jkozlowski - Jakub D Kozlowski - mail@jakub-kozlowski.com - - Developer - - UCL - - - wmartin - William Martin - will.st4@gmail.com - - Developer - - UCL - - - - - - MavenCentral - Maven repository - http://repo1.maven.org/maven2 - - true - - - false - - - - Apache Snapshots - http://repository.apache.org/content/groups/snapshots/ - - true - - - - OSS Sonatype - https://oss.sonatype.org/content/repositories/releases/ - - true - - - - - - 2.3 - 0.5-rc1 - 2.4.1 - 2.5.1 - 2.3.2 - 1.0-SNAPSHOT - 1.3.9 - 11.0.1 - 1.1 - 2.3.2 - 1.6 - 5.5.1 - 4.10 - 1.0.0 - 1.9.0-rc1 - 1.4.10 - 1.6.3 - 2.8 - 6.3 - - - - assembly - diff - integration-tests - patch - util - suffix-tree - - - - - - - maven-compiler-plugin - ${compiler.version} - - ${java.version} - ${java.version} - - - - org.apache.maven.plugins - maven-clean-plugin - ${clean.version} - - - org.apache.maven.plugins - maven-surefire-plugin - ${surefire.version} - - - org.codehaus.mojo - 
cobertura-maven-plugin - ${cobertura.version} - - - html - xml - - - - - - - - - - all - - - - org.apache.maven.plugins - maven-surefire-plugin - - - - - - fast - - true - - - - - org.apache.maven.plugins - maven-surefire-plugin - - performance-tests - - - - - - - diff --git a/report.tex b/report.tex new file mode 100644 index 0000000..6e4d0e9 --- /dev/null +++ b/report.tex @@ -0,0 +1,77 @@ +\documentclass[10pt,a4paper]{article} +\usepackage[utf8x]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{ucs} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} +\usepackage{hyperref} +\usepackage{booktabs} +\usepackage{longtable} +\usepackage{float} +\usepackage{graphicx} +\usepackage{listings} +\lstset{language=[LaTeX]Tex,%C++, + keywordstyle=\color{RoyalBlue},%\bfseries, + basicstyle=\small\ttfamily, + %identifierstyle=\color{NavyBlue}, + commentstyle=\color{Green}\ttfamily, + stringstyle=\rmfamily, + numbers=none,%left,% + numberstyle=\scriptsize,%\tiny + stepnumber=5, + numbersep=8pt, + showstringspaces=false, + breaklines=true, + frameround=ftff, + frame=single, + belowcaptionskip=.75\baselineskip + %frame=L +} +\usepackage{cleveref} +\usepackage[square, numbers]{natbib} +\usepackage{url} +\author{Amaury Couste +\and Sarina Gurung +\and Jakub Kozlowski +\and William Martin} +\title{Tools and Environments Group Project\\diffR} +\date{April 2, 2012} +\begin{document} + +\maketitle +%\tableofcontents +\input{introduction/introduction} +\input{requirements/requirements} +\input{design/design} +\input{implementation/implementation} +\input{testing/testing} +\input{team/structure-responsibilities} +\input{results/results} +\input{conclusion/conclusion} + +\appendix +\section{System Manual} +\subsubsection*{Requirements} +To build, please install an up-to-date version of \texttt{Maven3}. 
+ +\subsubsection*{Build} +\begin{itemize} +\item In the project root directory: \texttt{\$ mvn clean install} +\item In the assembly directory: \texttt{\$ mvn clean install assembly:single}. This will generate a \texttt{.tar.gz} and a \texttt{.zip} archive in the \texttt{target/} subdirectory, each containing the \texttt{diffr} jar and a bash script each for \texttt{diff} and \texttt{patch}. +\end{itemize} + +\subsubsection*{Running} +\paragraph{diffr to standard out} \texttt{\$ ./diffr.sh } +\paragraph{diffr to file} \texttt{\$ ./diffr.sh -o } + +\paragraph{patchr to standard out} \texttt{\$ ./patchr.sh } +\paragraph{patchr to file} \texttt{\$ ./patchr.sh -o } + +\subsubsection*{Source code} +The source code is distributed with this report. Access to the \texttt{bitbucket.org} repository is available on demand. Please note that the group has not yet agreed on a licence; therefore the source code should not be distributed or used for any purpose other than marking. + +\bibliographystyle{plainnat} +\bibliography{Bibliography} + +\end{document} diff --git a/requirements/requirements.tex b/requirements/requirements.tex new file mode 100644 index 0000000..3d1bb21 --- /dev/null +++ b/requirements/requirements.tex @@ -0,0 +1,63 @@ +\section{Requirements} + +Following is a list of functional and non-functional requirements for this tool. Each requirement listing consists of 4 columns: ID, Requirement, Priority and Risk. These columns are explained separately below. + +\paragraph{ID} +The identifier of the requirement. +Functional requirement IDs consist of an \emph{F} followed by a number, whilst non-functional requirement IDs consist of an \emph{N} followed by a number. + +\paragraph{Requirement} +The description of the requirement. +This column aims to concisely describe what is required, and consists of a statement about what the system shall do. + +\paragraph{Priority} +The priority of the requirement. 
+This column gives a qualitative judgement of the determined priority, and is ranked M for \emph{must do}, S for \emph{should do} and C for \emph{could do}. +The project's conclusion should see all requirements ranked M completed and most requirements ranked S completed. + +\paragraph{Risk} +The risk of the requirement. +This column gives a qualitative judgement of the determined risk, in terms of time, effort, effect on other requirements and likelihood of failure. +Possible ranks are High, Medium and Low with obvious definitions. + +\subsection{Functional Requirements} + +The functional requirements list particular functionalities that the tool should implement. + +\begin{center} +\begin{longtable}{c p{2.8in} c c } + + \toprule + \textbf{ID} & \multicolumn{1}{c}{\textbf{Requirement}} & \textbf{Priority} & \textbf{Risk} \\ + \midrule + + F01 & The System shall take as input two plain text files. & M & Low \\ + F02 & The System shall output plain text. & M & Low \\ + F03 & The System shall output to standard out. & M & Low \\ + F04 & The System shall output to a file & C & Medium \\ + F05 & The System shall compute the differences between the two input files. & M & Medium \\ + F06 & The System shall identify which sections are copied from the first file to the second. & M & High \\ + +\bottomrule +\end{longtable} +\end{center} + +\subsection{Non-Functional Requirements} + +The non-functional requirements describe the overall system characteristics. + +\begin{center} +\begin{longtable}{c p{2.8in} c c } + + \toprule + \textbf{ID} & \multicolumn{1}{c}{\textbf{Requirement}} & \textbf{Priority} & \textbf{Risk} \\ + \midrule + + N01 & The System shall scale well to large documents. & S & Medium \\ + N02 & The System shall process files up to $10,000$ lines long within seconds. & S & Medium \\ + N03 & The System shall be cross-platform. & C & Medium \\ + N04 & The System shall use a suffix tree in order to detect clones between two documents. 
& C & High \\ + +\bottomrule +\end{longtable} +\end{center} diff --git a/results/results.tex b/results/results.tex new file mode 100644 index 0000000..2641479 --- /dev/null +++ b/results/results.tex @@ -0,0 +1,77 @@ +\section{Results} + +The results achieved by our system are evaluated in this section and compared against \texttt{GNU DIFF}, already described in \Cref{Introduction}. + +\subsection{Test files} +The test files that we used for generating these results are the same as those used for integration testing (\Cref{Testing}). The lengths of the files used are as follows: + +\begin{table}[htbp] +\begin{center} +\begin{tabular}{ l l } +\textbf{File} & \textbf{Length (lines)} \\ +\cmidrule(r){1-2} +\texttt{version 2.6.27.62} & 106086 \\ +\texttt{version 3.2.13} & 177408 \\ +\end{tabular} +\end{center} +\caption{Lengths (in lines) of the test files used for comparing the performance of our tool and \texttt{GNU DIFF}.} +\end{table} + +\subsection{Experimental setup} +When measuring the performance, we first perform $3$ warm-up runs, in order to warm the disk caches and not give an unfair advantage to either tool, followed by $10$ experimental runs. 
+ +\subsection{Results of \texttt{diffr} and \texttt{patchr}} + +\subsubsection*{Time taken} +\begin{description} +\item[Command] \texttt{time ./diffr.sh kernel26.txt kernel33.txt -o kernel.patch} +\begin{table}[htbp] +\begin{center} +\begin{tabular}{ l p{3in} } +\textbf{Measurement} & \textbf{Times [s]} \\ +\cmidrule(r){1-2} +real & 1.852, 1.875, 1.806, 2.130, 1.927, 1.806, 2.135, 1.949, 2.016, 1.835 \\ +\end{tabular} +\end{center} +\caption{Experiment runs of our tool.} +\end{table} + +\end{description} + +\subsubsection*{Patch length} +\begin{description} +\item[Command] \texttt{cat kernel.patch | wc -l} +\item[Result] 101282 +\end{description} + +\subsection{Results of \texttt{GNU DIFF} } +\subsubsection*{Time taken} +\begin{description} +\item[Command] \texttt{time diff kernel26.txt kernel33.txt > kernel.patch} +\begin{table}[htbp] +\begin{center} +\begin{tabular}{ l p{3in} } +\textbf{Measurement} & \textbf{Times [s]} \\ +\cmidrule(r){1-2} +real & 0.673, 0.634, 0.693, 0.695, 0.713, 0.686, 0.677, 0.660, 0.674, 0.695 \\ +\end{tabular} +\end{center} +\caption{Experiment runs of \texttt{GNU DIFF}.} +\end{table} + +\end{description} + +\subsubsection*{Patch length} +\begin{description} +\item[Command] \texttt{cat kernel.patch | wc -l} +\item[Result] 254830 +\end{description} + +\subsection{Evaluation} +\texttt{GNU DIFF} produced a patch file roughly two and a half times as long as the one produced by our tool ($254830$ versus $101282$ lines). +Our system is clearly, for this set of test files, much more efficient in terms of patch file length. + +Given that neither sample deviates significantly from a normal distribution (Kolmogorov-Smirnov test; \texttt{diffr} $p=0.431211$; \texttt{GNU DIFF} $p=0.62061$), we can use a parametric t-test to evaluate the performance. +This test concludes that the mean run time of our tool is significantly higher than the mean run time of \texttt{GNU DIFF} at the 5\% significance level ($p=5.60009\times 10^{-11}$). 
+Therefore, given the means ($\mbox{diffr} = 1.9331$, $\mbox{GNU DIFF}=0.68$), we can conclude that our system is slower for the set of test files used for the evaluation. +However, we believe this is due to the overhead of the Java Virtual Machine startup, which is significant for such short-running programs. diff --git a/suffix-tree/pom.xml b/suffix-tree/pom.xml deleted file mode 100644 index ca4268e..0000000 --- a/suffix-tree/pom.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - 4.0.0 - - - diffr - parent - 1.0-SNAPSHOT - - - diffr - suffix-tree - 1.0-SNAPSHOT - jar - ${project.groupId}.${project.artifactId} - Suffix tree implementation for Diffr. - - - - diffr - util - ${current.version} - - - javolution - javolution - ${javolution.version} - - - com.google.guava - guava - ${guava.version} - - - diffr - util - ${current.version} - test - tests - - - com.google.caliper - caliper - ${caliper.version} - test - - - org.testng - testng - ${testng.version} - test - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - test - - - org.mockito - mockito-all - ${mockito.version} - test - - - diff --git a/suffix-tree/src/main/java/diffr/suffixtree/SuffixTree.java b/suffix-tree/src/main/java/diffr/suffixtree/SuffixTree.java deleted file mode 100644 index ffd619e..0000000 --- a/suffix-tree/src/main/java/diffr/suffixtree/SuffixTree.java +++ /dev/null @@ -1,104 +0,0 @@ -package diffr.suffixtree; - -import com.google.common.collect.Range; - -/** - * {@link SuffixTree} is a sequence of elements structured as a tree of suffixes that allows for quick retrieval of - * sub sequences. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public interface SuffixTree { - - /** - * Interface for traversing the {@link SuffixTree} in order to match suffixes. - * - * @since 0.2 - */ - public interface Matcher { - - /** - * Tries to match the next element in the suffix from the current position in the {@link SuffixTree}. - * - * @param element next element to match. 
- * - * @return {@link Matched#YES} if {@code element} was matched, {@link Matched#NO} otherwise. - * - * @throws NullPointerException if {@code element} is null. - * @throws IllegalStateException if this {@link Matcher} is finished, i.e. there was a previous call to - * {@link #matchNext(Object)} that resulted in {@link Matched#NO}. - */ - Matched matchNext(final E element); - - /** - * Checks if this {@link Matcher} has already returned {@link Matched#NO}. - * - * @return {@code true} if this {@link Matcher} has not yet returned {@link Matched#NO}, - * {@code false} otherwise. - */ - boolean isFinished(); - - /** - * Checks if this {@link Matcher} is still at the root, i.e. it either did not match any elements or it - * failed to match the first element. - * - * @return {@code true} if this {@link Matcher} still points to the root of the {@link SuffixTree}, - * {@code false} otherwise. - */ - boolean isRoot(); - - /** - * Gets the index of the last matched element. If this {@link Matcher} is {@link #isFinished()}, - * then this method returns the index of the element that was matched last. Because of the way the - * {@link SuffixTree} is constructed, the values returned might not be contiguous. - * - * @return index of last matched element. - * - * @throws IllegalStateException if this {@link Matcher} did not match any elements yet, - * or it failed to match the first attempt, i.e. the call to {@link #matchNext - * (Object)} failed. - */ - int lastIndex(); - - /** - * Gets a continuous range of elements that were matched so far. If {@link Matcher#isFinished} is {@code - * false}, the range returned refers to the range of elements matched up to the point that {@link #matchNext - * (Object)} returned {@link Matched#NO}. - * - * @return range of elements that were matched so far. - * - * @throws IllegalStateException if this {@link Matcher} did not match any elements yet, or it failed to match - * the first attempt, i.e. 
the call to {@link #matchNext(Object)} failed. - */ - Range range(); - } - - /** - * Indicates whether the element was matched. - * - * @since 0.2 - */ - public enum Matched { - - YES, NO; - - /** - * Checks if the element was matched. - * - * @return {@code true} if the element was matched, {@code false} otherwise. - */ - public boolean isMatched() { - return this == YES; - } - } - - /** - * Gets the {@link Matcher} for this {@link SuffixTree}. - * - * @return matcher implementation for this {@link SuffixTree}. - * - * @since 0.2 - */ - Matcher matcher(); -} diff --git a/suffix-tree/src/main/java/diffr/suffixtree/SuffixTrees.java b/suffix-tree/src/main/java/diffr/suffixtree/SuffixTrees.java deleted file mode 100644 index 45c6caa..0000000 --- a/suffix-tree/src/main/java/diffr/suffixtree/SuffixTrees.java +++ /dev/null @@ -1,38 +0,0 @@ -package diffr.suffixtree; - -import diffr.suffixtree.impl.SuffixTreeImpl; - -import java.util.List; - -/** - * Factory method for creating {@link SuffixTree}s. - * - * @author Jakub D Kozlowski - * @since 0.2 - */ -public final class SuffixTrees { - - private static final String ERROR_MSG = "This class should not be instantiated"; - - /** - * This class should not be instantiated. - * - * @throws UnsupportedOperationException this class should not be instantiated. - */ - public SuffixTrees() { - throw new UnsupportedOperationException(ERROR_MSG); - } - - /** - * Gets a {@link SuffixTree} for these {@code elements}, optimised for checking for existence of suffixes in - * {@code elements}. - * - * @param elements elements to build the {@link SuffixTree} for. - * @param type of elements. - * - * @return {@link SuffixTree} for these {@code elements}. 
- */ - public static SuffixTree newSuffixTree(final List elements) { - return SuffixTreeImpl.newSuffixTree(elements); - } -} diff --git a/suffix-tree/src/main/java/diffr/suffixtree/impl/Edge.java b/suffix-tree/src/main/java/diffr/suffixtree/impl/Edge.java deleted file mode 100644 index f110f01..0000000 --- a/suffix-tree/src/main/java/diffr/suffixtree/impl/Edge.java +++ /dev/null @@ -1,142 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.base.Objects; -import com.google.common.base.Objects.ToStringHelper; -import com.google.common.collect.Range; -import com.google.common.collect.Ranges; -import diffr.util.ListIterators; -import javolution.util.FastCollection.Record; - -import java.util.ListIterator; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * Represents an edge between a parent and child nodes, that points to a certain {@link Range} of elements: - * the ranges are inclusive on both sides, e.g. {@code [1, 3]} indicates that the {@link Edge} refers to elements at - * indexes {@code 1, 2, 3}, conversely {@code [1, 1]} indicates that the {@link Edge} refers to just element at index - * {@code 1}. Ranges can also be of type {@code [a,+∞]}, which indicates that the edge refers to all the elements - * from index {@code a} onwards. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public final class Edge { - - private final Record parent; - - private final Record child; - - private final Range range; - - /** - * Default constructor. - * - * @param parent parent of this {@link Edge}. - * @param child child of this {@link Edge}. - * @param range range of this {@link Edge}. - * - * @throws NullPointerException if any parameter is null. - * @throws IllegalArgumentException if {@link Range#hasLowerBound()} on {@code range} is false, - * {@code parent} and {@code child} are the same or the lower endpoint - * of the {@code range} is negative. 
- */ - public Edge(final Record parent, final Record child, final Range range) { - this.parent = checkNotNull(parent); - this.child = checkNotNull(child); - checkArgument(!parent.equals(child)); - checkNotNull(range); - checkArgument(range.hasLowerBound()); - checkArgument(range.lowerEndpoint() >= 0); - this.range = range; - } - - /** - * Gets the parent that this edge links to. - * - * @return the parent. - */ - public Record getParent() { - return parent; - } - - /** - * Gets the child that this edge links to. - * - * @return the child. - */ - public Record getChild() { - return child; - } - - /** - * Gets the range of elements that this node holds. - * - * @return range of elements. - */ - public Range getRange() { - return range; - } - - /** - * Gets an iterator for the elements pointed to by this {@link Edge}. - * - * @param suffixTree parent of this {@link Edge}. - * @param type of the elements. - * - * @return iterator for the elements of this {@link Edge}. - * - * @throws NullPointerException if {@code suffixTree} is null. - * @throws IllegalArgumentException if this edge has an upper boundary and the upper boundary exceeds the size of - * the list of elements of this {@code suffixTree}. - */ - public ListIterator iterator(final SuffixTreeImpl suffixTree) { - checkNotNull(suffixTree); - checkArgument(!range.hasUpperBound() || range.upperEndpoint() < suffixTree.getElements().size()); - return range.hasUpperBound() ? - ListIterators.limit(suffixTree.elementsListIterator(range.lowerEndpoint()), - range.upperEndpoint() - range.lowerEndpoint() + 1) : - suffixTree.elementsListIterator(range.lowerEndpoint()); - } - - /** - * Gets an edge between {@code newParent} and {@code newChild} that starts at {@code newStart} and has the same - * upper bound as {@code oldRange}. - * - * @param newParent parent for the edge. - * @param newChild child for the edge. - * @param oldRange range which upper bound will be the upper bound of the new edge. 
- * @param newStart start of the edge. - * - * @return edge between {@code newParent} and {@code newChild} that starts at {@code newStart} and has the same - * upper bound as {@code oldRange}. - * - * @throws NullPointerException if any parameter is null. - * @throws IllegalArgumentException if {@code newStart} is negative or it is greater than the upper bound of - * {@code oldRange}. - */ - public static Edge newStartEdge(final Record newParent, final Record newChild, final Range oldRange, - final int newStart) { - - checkNotNull(newParent); - checkNotNull(newChild); - checkNotNull(oldRange); - checkArgument(newStart >= 0); - - return oldRange.hasUpperBound() ? - new Edge(newParent, newChild, Ranges.closed(newStart, oldRange.upperEndpoint())) : - new Edge(newParent, newChild, Ranges.atLeast(newStart)); - } - - /** - * {@inheritDoc} - */ - @Override - public String toString() { - final ToStringHelper toString = Objects.toStringHelper(this); - toString.add("parent", parent); - toString.add("child", child).add("range", range); - return toString.toString(); - } -} diff --git a/suffix-tree/src/main/java/diffr/suffixtree/impl/MatcherImpl.java b/suffix-tree/src/main/java/diffr/suffixtree/impl/MatcherImpl.java deleted file mode 100644 index fff421f..0000000 --- a/suffix-tree/src/main/java/diffr/suffixtree/impl/MatcherImpl.java +++ /dev/null @@ -1,196 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.base.Optional; -import com.google.common.collect.Range; -import com.google.common.collect.Ranges; -import diffr.suffixtree.SuffixTree; -import diffr.suffixtree.SuffixTree.Matched; -import diffr.suffixtree.SuffixTree.Matcher; -import javolution.util.FastCollection.Record; - -import java.util.ListIterator; - -import static com.google.common.base.Preconditions.checkNotNull; -import static com.google.common.base.Preconditions.checkState; - -/** - * {@link Matcher} implementation for {@link SuffixTreeImpl}. 
- * - * @author Jakub D Kozlowski - * @since 0.2 - */ -public class MatcherImpl implements Matcher { - - private final SuffixTreeImpl suffixTree; - - private Optional curEdge; - - private Optional> curEdgeIterator; - - private boolean finished; - - private int matched; - - /** - * Default constructor. - * - * @param suffixTree {@link SuffixTree} to traverse. - * - * @throws NullPointerException if {@code suffixTree} is null. - */ - public MatcherImpl(final SuffixTreeImpl suffixTree) { - this.suffixTree = checkNotNull(suffixTree); - this.curEdge = Optional.absent(); - this.curEdgeIterator = Optional.absent(); - this.finished = false; - this.matched = 0; - } - - /** - * {@inheritDoc} - */ - @Override - public Matched matchNext(final E element) { - - checkNotNull(element); - checkState(!finished); - - // We are at root - if (!curEdge.isPresent()) { - final Optional edgeLookup = suffixTree.getEdge(suffixTree.getRoot(), element); - if (!edgeLookup.isPresent()) { - return notMatched(); - } - - curEdge = edgeLookup; - curEdgeIterator = Optional.of(curEdge.get().iterator(suffixTree)); - curEdgeIterator.get().next(); - matched++; - return Matched.YES; - } - - checkStarted(); - if (!curEdgeIterator.get().hasNext()) { - - final Optional edgeLookup = suffixTree.getEdge(curEdge.get().getChild(), element); - if (!edgeLookup.isPresent()) { - return notMatched(); - } - - curEdge = edgeLookup; - curEdgeIterator = Optional.of(curEdge.get().iterator(suffixTree)); - curEdgeIterator.get().next(); - matched++; - return Matched.YES; - } - - final E edgeElement = curEdgeIterator.get().next(); - if (edgeElement.hashCode() == element.hashCode() && edgeElement.equals(element)) { - matched++; - return Matched.YES; - } - - curEdgeIterator.get().previous(); - return notMatched(); - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isFinished() { - return finished; - } - - /** - * {@inheritDoc} - */ - @Override - public boolean isRoot() { - return !curEdge.isPresent(); - } - - /** - 
* {@inheritDoc} - */ - @Override - public int lastIndex() { - checkStarted(); - checkState(curEdge.isPresent()); - return curEdgeIterator.get().previousIndex(); - } - - /** - * {@inheritDoc} - */ - @Override - public Range range() { - checkStarted(); - checkState(curEdge.isPresent()); - return Ranges.closed(curEdgeIterator.get().previousIndex() - matched + 1, - curEdgeIterator.get().previousIndex()); - } - - /** - * Checks if the current edge has next element. - * - * @return {@code true} if the current edge has next element. - */ - public boolean edgeHasNext() { - checkStarted(); - return curEdgeIterator.isPresent() ? curEdgeIterator.get().hasNext() : false; - } - - /** - * Gets the last edge matched. - * - * @return last matched edge. - * - * @throws IllegalStateException if this {@link MatcherImpl} has not yet matched any elements. - */ - public Edge lastEdge() { - checkStarted(); - if (curEdge.isPresent()) { - return curEdge.get(); - } - throw new IllegalStateException(); - } - - /** - * Gets the last matched node. - * - * @return last matched node or root node if the attempt to match at root was unsuccessful. - * - * @throws IllegalStateException if this {@link MatcherImpl} has not yet matched any elements. - */ - public Record lastNode() { - checkStarted(); - if (curEdge.isPresent()) { - return curEdge.get().getChild(); - } - return suffixTree.getRoot(); - } - - /** - * Checks if this {@link MatcherImpl} has already matched an element. - * - * @throws IllegalStateException if this {@link MatcherImpl} has not yet matched any elements. - */ - private void checkStarted() { - - if (!finished) { - checkState(curEdge.isPresent()); - checkState(curEdgeIterator.isPresent()); - } - } - - /** - * Sets {@code finished} to {@code true} and returns {@link Matched#NO}. - * - * @return {@link Matched#NO}. 
- */ - private Matched notMatched() { - finished = true; - return Matched.NO; - } -} diff --git a/suffix-tree/src/main/java/diffr/suffixtree/impl/NodeKey.java b/suffix-tree/src/main/java/diffr/suffixtree/impl/NodeKey.java deleted file mode 100644 index 084bd02..0000000 --- a/suffix-tree/src/main/java/diffr/suffixtree/impl/NodeKey.java +++ /dev/null @@ -1,98 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.base.Objects; -import diffr.suffixtree.SuffixTree; -import javolution.util.FastCollection.Record; - -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * Represents a {@code [parent, element]} pair that can be used as a key to a map of {@link Edge}s. - * - * @param type of element this {@link NodeKey} points to. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public final class NodeKey { - - private final Record parent; - - private final E element; - - private final int hashCode; - - /** - * Default constructor. - * - * @param parent parent that this {@link NodeKey} points to. - * @param element element for this {@link NodeKey}. - * - * @throws NullPointerException if any parameter is null. - */ - private NodeKey(final Record parent, final E element) { - this.parent = checkNotNull(parent); - this.element = checkNotNull(element); - this.hashCode = Objects.hashCode(parent, element); - } - - /** - * {@inheritDoc} - */ - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || this.getClass() != o.getClass()) return false; - - final NodeKey that = (NodeKey) o; - - if (hashCode != that.hashCode) return false; - if (!element.equals(that.element)) return false; - if (parent != that.parent) return false; - - return true; - } - - /** - * {@inheritDoc} - */ - @Override - public int hashCode() { - return hashCode; - } - - /** - * Gets an instance of {@link NodeKey} for this {@code parent} and {@code element.} - * - * @param parent parent of this {@link NodeKey}. 
- * @param element element of this {@link NodeKey}. - * @param type of this {@link NodeKey}. - * - * @return new instance of {@link NodeKey}. - * - * @throws NullPointerException if any parameter is null. - */ - public static NodeKey lookup(final Record parent, final E element) { - return new NodeKey(parent, element); - } - - /** - * Gets an instance of {@link NodeKey} for the parent of this {@code edge} and the element pointed to by the - * lower boundary of this {@code edge}. - * - * @param edge edge for the parent of which this {@link NodeKey} will be created. - * @param suffixTree parent {@link SuffixTree} to lookup the element. - * @param type of the {@link SuffixTree}. - * - * @return instance of {@link NodeKey} for this {@code edge}. - * - * @throws NullPointerException if any parameter is null. - * @throws IndexOutOfBoundsException if lower boundary of this {@code edge} is out of range, - * of this {@code suffixTree}, i.e. {@code index < 0 || index >= size())}. - */ - public static > NodeKey newNodeKey(final Edge edge, - final SuffixTreeImpl suffixTree) { - return new NodeKey(checkNotNull(edge).getParent(), - checkNotNull(suffixTree.getElement(edge.getRange().lowerEndpoint()))); - } -} diff --git a/suffix-tree/src/main/java/diffr/suffixtree/impl/SuffixTreeImpl.java b/suffix-tree/src/main/java/diffr/suffixtree/impl/SuffixTreeImpl.java deleted file mode 100644 index 557fa63..0000000 --- a/suffix-tree/src/main/java/diffr/suffixtree/impl/SuffixTreeImpl.java +++ /dev/null @@ -1,238 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; -import com.google.common.collect.Range; -import com.google.common.collect.Ranges; -import diffr.suffixtree.SuffixTree; -import javolution.util.FastCollection.Record; -import javolution.util.FastMap; -import javolution.util.FastTable; -import javolution.util.Index; - -import java.util.List; -import java.util.ListIterator; -import java.util.Map; - 
-import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * Default {@link SuffixTreeImpl} implementation. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public final class SuffixTreeImpl implements SuffixTree { - - private final FastTable elements; - - private final Map, Edge> edges; - - private final Record root; - - private Record curRecord; - - /** - * Default constructor. - * - * @param elements elements that this {@link SuffixTreeImpl} will be built for. - * - * @throws NullPointerException if {@code elements} is null. - */ - private SuffixTreeImpl(final List elements) { - checkNotNull(elements); - this.elements = FastTable.newInstance(); - this.elements.addAll(elements); - this.curRecord = Index.ZERO.getNext(); - this.root = this.curRecord; - this.edges = FastMap.newInstance(); - } - - /** - * Adds a child to the {@code parent}. - * - * @param range range of the edge that will be created. - * - * @return the new node created. - * - * @throws NullPointerException if any parameter is null. - * @throws IllegalArgumentException if {@code range} does not have a lower bound. - * @since 0.2 - */ - private Record addChild(final Record parent, final Range range) { - checkNotNull(parent); - checkNotNull(range); - checkArgument(range.hasLowerBound()); - this.curRecord = curRecord.getNext(); - final Edge newEdge = new Edge(parent, curRecord, range); - addEdge(newEdge); - return curRecord; - } - - /** - * Removes this {@code oldEdge} and instead creates: - * - *
    - *
  • a {@code branchNode} with a {@code branchEdge} {@code [oldEdge.parentNode, branchNode, - * [oldNode.range.lowerBound(), lastMatched]}
  • - *
  • an {@code leftEdge} {@code [branchNode, oldEdge.childNode, [lastMatched, oldEdge.range.upperBound]]}
  • - *
  • a {@code splitNode} with a {@code splitEdge} {@code [branchNode, splitNode, [firstNotMatchedSuffixIndex, - * +∞]}
  • - *
- * - * @param oldEdge edge that will be removed. - * @param lastMatchedEdgeIndex index of the last element matched in the edge. - * @param firstNotMatchedSuffixIndex index of the first element not matched from the suffix. - * - * @throws NullPointerException if {@code oldEdge} is null or it does not exists in this {@link SuffixTreeImpl}. - * @since 0.2 - */ - private void splitEdge(final Edge oldEdge, final int lastMatchedEdgeIndex, final int firstNotMatchedSuffixIndex) { - - checkNotNull(edges.remove(NodeKey.newNodeKey(checkNotNull(oldEdge), this))); - - // {@code [oldEdge.parentNode, branch, [oldNode.range.lowerBound(), lastMatched]} - final Record branch - = addChild(oldEdge.getParent(), - Ranges.closed(oldEdge.getRange().lowerEndpoint(), lastMatchedEdgeIndex)); - - // {@code [branch, oldEdge.childNode, [lastMatchedEdgeIndex + 1, oldEdge.range.lowerBound]]} - final Edge leftEdge = Edge.newStartEdge(branch, oldEdge.getChild(), oldEdge.getRange(), - lastMatchedEdgeIndex + 1); - addEdge(leftEdge); - - // {@code [branch, splitNode, [firstNotMatchedSuffixIndex, +∞]} - addChild(branch, Ranges.atLeast(firstNotMatchedSuffixIndex)); - } - - /** - * Adds this {@code edge}. No check is performed whether the {@code parentNode} and {@code childNode} is present - * in this {@link SuffixTreeImpl}. - * - * @param edge new edge. - * - * @throws NullPointerException if {@code edge} is null. - * @throws IllegalArgumentException if this {@link SuffixTreeImpl} already contains this {@code edge}. - * @since 0.2 - */ - @VisibleForTesting - void addEdge(final Edge edge) { - final NodeKey nodeKey = NodeKey.newNodeKey(checkNotNull(edge), this); - checkArgument(!edges.containsKey(nodeKey)); - edges.put(nodeKey, edge); - } - - /** - * Gets the edge from this {@code parent} to this {@code element}. - * - * @param parent parent of the {@link Edge} to get. - * @param element the first element of the {@code Edge} to get. - * - * @return edge from this {@code parent} to this {@code element}. 
- * - * @throws NullPointerException if any parameter is null. - */ - @VisibleForTesting - Optional getEdge(final Record parent, final E element) { - return Optional.fromNullable(edges.get(NodeKey.lookup(checkNotNull(parent), checkNotNull(element)))); - } - - /** - * Gets the root node of this {@link SuffixTree}. - * - * @return root of this {@link SuffixTree}. - */ - @VisibleForTesting - Record getRoot() { - return root; - } - - /** - * Gets a {@link ListIterator} of elements in this {@link SuffixTreeImpl}, starting from {@code index}. The - * returned iterator supports all the mutation operations like {@link ListIterator#remove()} or {@link - * ListIterator#set(Object)}, however it should only be used to retrieve elements. - * - * @param index starting index for this {@link ListIterator}. - * - * @return {@link ListIterator} of elements in this {@link SuffixTreeImpl}, starting from {@code index}. - * - * @since 0.2 - */ - ListIterator elementsListIterator(final int index) { - return elements.listIterator(index); - } - - /** - * Gets the element at this {@code element}. - * - * @param index index of the element. - * - * @return element at this {@code element}. - */ - E getElement(int index) { - return elements.get(index); - } - - /** - * Returns a read-only view of elements in this {@link SuffixTree}. - * - * @return read-only view of elements in this {@link SuffixTree}. - */ - List getElements() { - return elements.unmodifiable(); - } - - /** - * {@inheritDoc} - */ - @Override - public MatcherImpl matcher() { - return new MatcherImpl(this); - } - - /** - * Factory for {@link SuffixTreeImpl}s. - * - * @param elements elements to build the {@link SuffixTreeImpl} for. - * - * @return new {@link SuffixTreeImpl} for {@code elements}. - * - * @throws NullPointerException if {@code elements} is null. 
- */ - public static > SuffixTreeImpl newSuffixTree(final List elements) { - - checkNotNull(elements); - - final SuffixTreeImpl suffixTree = new SuffixTreeImpl(elements); - - final ListIterator suffixes = elements.listIterator(); - - while (suffixes.hasNext()) { - - suffixes.next(); - - final MatcherImpl suffixMatcher = suffixTree.matcher(); - final ListIterator suffixIterator = elements.listIterator(suffixes.previousIndex()); - - while (suffixIterator.hasNext()) { - - if (!suffixMatcher.matchNext(suffixIterator.next()).isMatched()) { - if (suffixMatcher.edgeHasNext()) { - suffixTree.splitEdge(suffixMatcher.lastEdge(), - suffixMatcher.lastIndex(), - suffixIterator.previousIndex()); - } - else { - suffixTree.addChild(suffixMatcher.lastNode(), - Ranges.atLeast(suffixIterator.previousIndex())); - - } - break; - } - } - } - - return suffixTree; - } -} diff --git a/suffix-tree/src/test/java/diffr/suffixtree/SuffixTreesTest.java b/suffix-tree/src/test/java/diffr/suffixtree/SuffixTreesTest.java deleted file mode 100644 index 7c9d2ba..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/SuffixTreesTest.java +++ /dev/null @@ -1,33 +0,0 @@ -package diffr.suffixtree; - -import com.google.common.collect.Lists; -import diffr.suffixtree.SuffixTrees; -import diffr.suffixtree.impl.SuffixTreeImpl; -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link SuffixTrees}. 
- * - * @author Jakub D Kozlowski - * @since 0.2 - */ -public class SuffixTreesTest { - - @Test(expectedExceptions = UnsupportedOperationException.class) - public void testConstructor() { - new SuffixTrees(); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testNewSuffixTreeNullElements() { - SuffixTrees.newSuffixTree(null); - } - - @Test - public void testNewSuffixTree() { - assertThat(SuffixTrees.newSuffixTree(Lists.charactersOf("bla")), is(SuffixTreeImpl.class)); - } -} diff --git a/suffix-tree/src/test/java/diffr/suffixtree/impl/EdgeTest.java b/suffix-tree/src/test/java/diffr/suffixtree/impl/EdgeTest.java deleted file mode 100644 index 033cb2c..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/impl/EdgeTest.java +++ /dev/null @@ -1,159 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; -import com.google.common.collect.Range; -import com.google.common.collect.Ranges; -import javolution.util.FastCollection.Record; -import javolution.util.Index; -import org.testng.annotations.Test; - -import java.util.Iterator; -import java.util.List; -import java.util.ListIterator; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.hasToString; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link Edge}. 
- * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public class EdgeTest { - - private static final Record parent = Index.ZERO.getNext(); - - private static final Record child = parent.getNext(); - - private static final Range range = Ranges.atLeast(0); - - private static final Edge edge = new Edge(parent, child, range); - - @Test(expectedExceptions = NullPointerException.class) - public void testContructorNullParent() { - new Edge(null, child, range); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testContructorNullChild() { - new Edge(parent, null, range); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testConstructorNullRange() { - new Edge(parent, child, null); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testConstructorRangeWithoutLowerBound() { - new Edge(parent, child, Ranges.atMost(2)); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testConstructorRangeLowerBoundNegative() { - new Edge(parent, child, Ranges.atMost(-1)); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testConstructorParentChildSame() { - new Edge(parent, parent, range); - } - - @Test - public void testConstructor() { - final Edge edge = new Edge(parent, child, range); - assertThat(edge.getParent(), is(parent)); - assertThat(edge.getChild(), is(child)); - assertThat(edge.getRange(), is(range)); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testIteratorNullSuffixTree() { - edge.iterator(null); - } - - @Test - public void testIteratorEdgeNoUpperBoundary() { - - final Edge edge = new Edge(parent, child, Ranges.atLeast(1)); - final SuffixTreeImpl suffixTree = SuffixTreeImpl.newSuffixTree(Lists.newArrayList(1, 2, 3, 4, 5)); - - final ListIterator edgeIterator = edge.iterator(suffixTree); - final ListIterator elementsIterator = suffixTree.elementsListIterator(1); - 
assertThat(Iterators.elementsEqual(edgeIterator, elementsIterator), is(true)); - } - - @Test - public void testIteratorEdgeUpperBoundary() { - - final List elements = Lists.newArrayList(1, 2, 3, 4, 5, 6); - final SuffixTreeImpl suffixTree = SuffixTreeImpl.newSuffixTree(elements); - - for (int i = 0; i < elements.size(); i++) { - final Edge edge = new Edge(parent, child, Ranges.closed(0, i)); - final Iterator iterator = edge.iterator(suffixTree); - assertThat(Iterators.elementsEqual(iterator, elements.subList(0, i + 1).iterator()), is(true)); - - final Edge edge2 = new Edge(parent, child, Ranges.closed(i, i)); - final Iterator iterator2 = edge2.iterator(suffixTree); - assertThat(Iterators.elementsEqual(iterator2, elements.subList(i, i + 1).iterator()), is(true)); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testIteratorEdgeNotFromSuffixTree() { - - final List elements = Lists.newArrayList(1, 2, 3, 4, 5, 6); - final SuffixTreeImpl suffixTree = SuffixTreeImpl.newSuffixTree(elements); - final Edge edge = new Edge(parent, child, Ranges.closed(1, 6)); - - edge.iterator(suffixTree); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testNewStartEdgeNullParent() { - Edge.newStartEdge(null, child, range, 1); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testNewStartEdgeNullChild() { - Edge.newStartEdge(parent, null, range, 1); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testNewStartEdgeNullRange() { - Edge.newStartEdge(parent, child, null, 1); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testNewStartEdgeNewStartNegative() { - Edge.newStartEdge(parent, child, range, -1); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testNewStartEdgeNewStartGreaterThanUpperBound() { - Edge.newStartEdge(parent, child, Ranges.closed(0, 1), 2); - } - - @Test - public void 
testNewStartEdgeNoUpperBound() { - assertThat(Edge.newStartEdge(parent, child, range, 1).getRange().hasUpperBound(), is(false)); - } - - @Test - public void testNewStartEdgeHasUpperBound() { - assertThat(Edge.newStartEdge(parent, child, Ranges.closed(1, 3), 2).getRange().upperEndpoint(), - is(3)); - } - - @Test - public void testToString() { - assertThat(edge, - hasToString(equalTo("Edge{" + "parent=" + parent + ", " + - "child=" + child + ", range=" + range + "}"))); - } -} diff --git a/suffix-tree/src/test/java/diffr/suffixtree/impl/MatcherImplTest.java b/suffix-tree/src/test/java/diffr/suffixtree/impl/MatcherImplTest.java deleted file mode 100644 index f226df3..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/impl/MatcherImplTest.java +++ /dev/null @@ -1,132 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.collect.Lists; -import com.google.common.collect.Range; -import diffr.suffixtree.SuffixTree; -import diffr.suffixtree.SuffixTree.Matched; -import diffr.suffixtree.SuffixTree.Matcher; -import diffr.suffixtree.SuffixTrees; -import org.testng.annotations.Test; - -import java.util.ListIterator; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link MatcherImpl}. 
- * - * @author Jakub D Kozlowski - * @since 0.2 - */ -public class MatcherImplTest { - - @Test(expectedExceptions = NullPointerException.class) - public void testConstructorNullSuffixTree() { - new MatcherImpl(null); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testMatchNextNullElement() { - new MatcherImpl(SuffixTreeImpl.newSuffixTree(Lists.newArrayList(1, 2))).matchNext(null); - } - - @Test - public void testMatchNextElementMatches() { - - final SuffixTree suffixTree = SuffixTreeImpl.newSuffixTree(Lists.charactersOf("mississippi")); - final Matcher matcher = suffixTree.matcher(); - - for (Character c : Lists.charactersOf("issippi")) { - assertThat(matcher.matchNext(c), is(Matched.YES)); - } - - assertThat(matcher.isFinished(), is(false)); - } - - @Test - public void testMatchNextElementDoesNotMatch() { - - final SuffixTree suffixTree = SuffixTreeImpl.newSuffixTree(Lists.charactersOf("mississippi")); - final Matcher matcher = suffixTree.matcher(); - - for (Character c : Lists.charactersOf("issipp")) { - assertThat(matcher.matchNext(c), is(Matched.YES)); - } - - assertThat(matcher.matchNext('p'), is(Matched.NO)); - assertThat(matcher.isFinished(), is(true)); - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testMatchNextElementMatcherFinished() { - final SuffixTree suffixTree = SuffixTreeImpl.newSuffixTree(Lists.charactersOf("mississippi")); - final Matcher matcher = suffixTree.matcher(); - - for (Character c : Lists.charactersOf("issipp")) { - assertThat(matcher.matchNext(c), is(Matched.YES)); - } - - assertThat(matcher.matchNext('p'), is(Matched.NO)); - assertThat(matcher.isFinished(), is(true)); - matcher.matchNext('b'); - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testLastIndexNotStarted() { - SuffixTreeImpl.newSuffixTree(Lists.charactersOf("mississippi")).matcher().lastIndex(); - } - - @Test - public void testIsRootNotStarted() { - 
assertThat(SuffixTreeImpl.newSuffixTree(Lists.charactersOf("mississippi")).matcher().isRoot(), is(true)); - } - - @Test - public void testIsRootFailedToMatchFirstElement() { - final Matcher matcher = SuffixTrees.newSuffixTree(Lists.charactersOf("mississippi")).matcher(); - matcher.matchNext('k'); - assertThat(matcher.isRoot(), is(true)); - } - - @Test - public void testIsRootMatcherNotAtRoot() { - final Matcher matcher = SuffixTrees.newSuffixTree(Lists.charactersOf("mississippi")).matcher(); - matcher.matchNext('i'); - assertThat(matcher.isRoot(), is(false)); - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testGetRangeNotStarted() { - SuffixTrees.newSuffixTree(Lists.charactersOf("mississippi")).matcher().range(); - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testGetRangeFailedToMatchFirstElement() { - final Matcher matcher = SuffixTrees.newSuffixTree(Lists.charactersOf("mississippi")).matcher(); - matcher.matchNext('k'); - matcher.range(); - } - - @Test(dataProviderClass = SuffixTreeImplTestUtils.class, - dataProvider = SuffixTreeImplTestUtils.STRING_DATA_PROVIDER) - public void testGetRange(final String testString) { - final SuffixTreeImpl suffixTree = SuffixTreeImpl.newSuffixTree(Lists.charactersOf(testString)); - for (int suffixIndex = 0; suffixIndex < suffixTree.getElements().size(); suffixIndex++) { - - final ListIterator suffixIterator = suffixTree.elementsListIterator(suffixIndex); - final Matcher suffixMatcher = suffixTree.matcher(); - final StringBuilder b = new StringBuilder(); - - while (suffixIterator.hasNext()) { - final Character c = suffixIterator.next(); - b.append(c); - assertThat(suffixMatcher.matchNext(c), is(Matched.YES)); - final Range matchedRange = suffixMatcher.range(); - - assertThat(testString.indexOf(b.toString()), is(matchedRange.lowerEndpoint())); - assertThat(testString.indexOf(b.toString()) + b.length() - 1, is(matchedRange.upperEndpoint())); - } - } - } -} diff --git 
a/suffix-tree/src/test/java/diffr/suffixtree/impl/NodeKeyTest.java b/suffix-tree/src/test/java/diffr/suffixtree/impl/NodeKeyTest.java deleted file mode 100644 index f9ffe85..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/impl/NodeKeyTest.java +++ /dev/null @@ -1,48 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.collect.Lists; -import com.google.common.collect.Ranges; -import javolution.util.FastCollection.Record; -import javolution.util.Index; -import org.testng.annotations.Test; - -/** - * Tests {@link NodeKey}. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public class NodeKeyTest { - - private static final Record parent = Index.ZERO.getNext(); - - private static final Record child = parent.getNext(); - - private final SuffixTreeImpl suffixTree = SuffixTreeImpl.newSuffixTree(Lists.newArrayList(1,2,3)); - - @Test(expectedExceptions = NullPointerException.class) - public void testLookupNullParent() { - NodeKey.lookup(null, 1); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testLookupNullElement() { - NodeKey.lookup(parent, null); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testNewNodeKeyNullEdge() { - NodeKey.newNodeKey(null, suffixTree); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testNewNodeKeyNullSuffixTree() { - NodeKey.newNodeKey(new Edge(parent, child, Ranges.atLeast(1)), - (SuffixTreeImpl) null); - } - - @Test(expectedExceptions = IndexOutOfBoundsException.class) - public void testNewNodeIndexOutOfBound() { - NodeKey.newNodeKey(new Edge(parent, child, Ranges.atLeast(3)), suffixTree); - } -} diff --git a/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplPerfTest.java b/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplPerfTest.java deleted file mode 100644 index 6ac28f5..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplPerfTest.java +++ /dev/null @@ -1,69 +0,0 @@ -package 
diffr.suffixtree.impl; - -import com.google.caliper.Param; -import com.google.caliper.Runner; -import com.google.caliper.SimpleBenchmark; -import diffr.suffixtree.SuffixTrees; -import diffr.util.RandomFiles; -import javolution.text.Text; -import org.testng.annotations.Test; - -import java.util.List; - -import static diffr.suffixtree.impl.SuffixTreeImplTestUtils.validateSuffixTree; - -/** - * Performance tests for {@link SuffixTreeImpl}. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -@Test(groups = "performance-tests") -public class SuffixTreeImplPerfTest { - - @Test - public void testPerformance() { - Runner.main(SuffixTreeImplBenchmark.class, new String[]{"--trials", "2", "-DfileLength=100,1000,10000"}); - } - - /** - * Benchmarks various operations on {@link SuffixTreeImpl}. - */ - public static class SuffixTreeImplBenchmark extends SimpleBenchmark { - - List lines; - - SuffixTreeImpl suffixTree; - - @Param - int fileLength; - - @Override - protected void setUp() { - lines = RandomFiles.getRandomFile(fileLength, 1341376661698861013L + fileLength); - suffixTree = SuffixTreeImpl.newSuffixTree(lines); - } - - /** - * Build the {@link SuffixTreeImpl} {@code reps} times. - * - * @param reps number of repetitions. - */ - public void timeBuildSuffixTreeImpl(int reps) { - for (int i = 0; i < reps; i++) { - SuffixTrees.newSuffixTree(lines); - } - } - - /** - * Build the {@link SuffixTreeImpl} and validates it {@code reps} times. - * - * @param reps number of repetitions. 
- */ - public void timeVerifySuffixTreeImpl(int reps) { - for (int i = 0; i < reps; i++) { - validateSuffixTree(suffixTree); - } - } - } -} diff --git a/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplTest.java b/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplTest.java deleted file mode 100644 index 354b3d2..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplTest.java +++ /dev/null @@ -1,43 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.collect.Lists; -import javolution.text.Text; -import org.testng.annotations.Test; - -import java.util.List; - -import static diffr.suffixtree.impl.SuffixTreeImplTestUtils.validateSuffixTree; - -/** - * Tests {@link SuffixTreeImpl}. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public class SuffixTreeImplTest { - - - @Test(dataProviderClass = SuffixTreeImplTestUtils.class, - dataProvider = SuffixTreeImplTestUtils.STRING_DATA_PROVIDER) - public void testNewSuffixTreeStrings(final String testString) { - validateSuffixTree(SuffixTreeImpl.newSuffixTree(Lists.charactersOf(testString))); - } - - @Test(dataProviderClass = SuffixTreeImplTestUtils.class, - dataProvider = SuffixTreeImplTestUtils.FILE_DATA_PROVIDER) - public void testNewSuffixTreeFiles(final List testFile) { - validateSuffixTree(SuffixTreeImpl.newSuffixTree(testFile)); - } - - @Test(expectedExceptions = NullPointerException.class) - public void testAddEdgeNullEdge() { - SuffixTreeImpl.newSuffixTree(Lists.charactersOf("123")).addEdge(null); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testAddEdgeContainsEdge() { - final SuffixTreeImpl suffixTree - = SuffixTreeImpl.newSuffixTree(Lists.charactersOf("123")); - suffixTree.addEdge(suffixTree.getEdge(suffixTree.getRoot(), '1').get()); - } -} diff --git a/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplTestUtils.java b/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplTestUtils.java deleted 
file mode 100644 index d16c504..0000000 --- a/suffix-tree/src/test/java/diffr/suffixtree/impl/SuffixTreeImplTestUtils.java +++ /dev/null @@ -1,96 +0,0 @@ -package diffr.suffixtree.impl; - -import com.google.common.collect.Lists; -import diffr.suffixtree.SuffixTree; -import diffr.suffixtree.SuffixTree.Matched; -import diffr.suffixtree.SuffixTree.Matcher; -import diffr.util.RandomFiles; -import javolution.text.Text; -import org.testng.annotations.DataProvider; - -import java.util.List; -import java.util.ListIterator; -import java.util.Random; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Utilities for testing {@link SuffixTree}s. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public class SuffixTreeImplTestUtils { - - public static final String STRING_DATA_PROVIDER = "string-data-provider"; - - public static final String FILE_DATA_PROVIDER = "file-data-provider"; - - /** - * Gets the test strings. - * - * @return the test strings. - */ - @DataProvider(name = STRING_DATA_PROVIDER, parallel = true) - public static String[][] getStrings() { - final List testStrings = Lists.newArrayList(); - - testStrings.add(new String[]{"doodahde"}); - testStrings.add(new String[]{"ababa"}); - testStrings.add(new String[]{"xabxa"}); - testStrings.add(new String[]{"bananas"}); - testStrings.add(new String[]{"bookkeeper"}); - testStrings.add(new String[]{"mississippi"}); - testStrings.add(new String[]{"misrsissippi"}); - testStrings.add(new String[]{"I at once heard the rickety crickety creaking of the bridge."}); - - final Random random = new Random(1341376661705837014L); - - for (int i = 0; i < 2; i++) { - final StringBuilder b = new StringBuilder(); - final int length = random.nextInt(500); - for (int j = 0; j < length; j++) { - b.append(random.nextInt(10)); - } - testStrings.add(new String[]{b.toString()}); - } - - return testStrings.toArray(new String[][]{}); - } - - /** - * Gets the test files. 
- * - * @return the test files. - */ - @DataProvider(name = FILE_DATA_PROVIDER, parallel = true) - public static List[][] getFiles() { - final List[]> textFiles = Lists.newArrayList(); - - for (int i = 0; i < 5; i++) { - textFiles.add(new List[]{RandomFiles.getRandomFile(1000, 1341376661708488015L + i * 1000L)}); - } - - return textFiles.toArray(new List[][]{}); - } - - /** - * Validates this {@code suffixTree}. - * - * @param suffixTree {@link SuffixTree} to validate. - * @param type of elements in {@code suffixTree}. - */ - public static void validateSuffixTree(final SuffixTreeImpl suffixTree) { - - for (int suffixIndex = 0; suffixIndex < suffixTree.getElements().size(); suffixIndex++) { - - final ListIterator suffixIterator = suffixTree.elementsListIterator(suffixIndex); - final Matcher suffixMatcher = suffixTree.matcher(); - - while (suffixIterator.hasNext()) { - assertThat(suffixMatcher.matchNext(suffixIterator.next()), is(Matched.YES)); - } - } - } -} diff --git a/team/structure-responsibilities.tex b/team/structure-responsibilities.tex new file mode 100644 index 0000000..ce9ad7a --- /dev/null +++ b/team/structure-responsibilities.tex @@ -0,0 +1,25 @@ +\section{Team} + +Here we discuss the structure of the team, and the responsibilities of each member on this project. + +\subsection{Development Process} + +We used the scrum development cycle to structure our project. +This meant dividing the work into a series of \emph{sprints}, each approximately half a week long. +Usually sprint lengths are in the order of weeks, but given the short time to complete this project, short iterations were essential to a timely delivery. + +Each sprint started with a meeting: either physical, or after the end of term, electronic. +In this meeting we reviewed the progress of the tasks of the previous sprint in order to decide what could be released, and what must roll over to the next sprint. 
+Following the release of the previous sprint, we decided the present sprint's deadline, and discussed tasks which needed doing. +Tasks were then allocated, first by preference, and then arbitrarily. + +\subsection{Structure} + +The structure of the team was equal and democratic; we followed the scrum development process to decide tasks and deadlines, and would discuss any issues as and when they presented themselves. +Due to the small size of the team and the short time available to complete the project, role allocation would have adversely affected the progress of the project. + +\subsection{Responsibilities} + +As discussed above, each team member assumed an equal role within the group, and we all had a joint responsibility to complete the project on time. +This translated into a responsibility to complete our tasks on time and to the best of our ability, and to take an active role in scrum meetings. +Meetings were crucial to the success of the project: ensuring that the right tasks were set and allocated to the right members, as well as ensuring that the deadlines were both feasible and on track for the project submission deadline. diff --git a/testing/testing.tex b/testing/testing.tex new file mode 100644 index 0000000..7b81ccd --- /dev/null +++ b/testing/testing.tex @@ -0,0 +1,35 @@ +\section{Testing} +\label{Testing} + +To give us confidence that our software is functionally correct, we used a thorough testing methodology. Because of the obvious connection between our \texttt{diff} and \texttt{patch} implementations, the two most important qualities to strive for are consistency and robustness. In other words, our software \textit{must} produce the same output if it is fed the same input multiple times. + +\paragraph{Unit Testing} +We wrote our unit tests with \texttt{TestNG}~\cite{testng}, mainly due to previous exposure. We also used the \texttt{hamcrest}~\cite{hamcrest} library. This allowed us to write powerful assertions very easily, as illustrated in the following code snippet. 
+ +\begin{lstlisting}[caption={\texttt{diffr.suffixtree.impl.MatcherImplTest}}] +@Test(expectedExceptions = IllegalStateException.class) +public void testMatchNextElementMatcherFinished() { + final SuffixTree suffixTree = + SuffixTreeImpl.newSuffixTree(Lists.charactersOf("mississippi")); + final Matcher matcher = suffixTree.matcher(); + + for (Character c : Lists.charactersOf("issipp")) { + assertThat(matcher.matchNext(c), is(Matched.YES)); + } + + assertThat(matcher.matchNext('p'), is(Matched.NO)); + assertThat(matcher.isFinished(), is(true)); + matcher.matchNext('b'); +} +\end{lstlisting} + +Our tests cover over 90\% of the codebase. We are confident that our code is robust and thoroughly tested. Both \texttt{diffr} and \texttt{patchr} are tested individually on manually derived test files, designed to thoroughly test corner cases. + +\paragraph{Integration Testing} +\label{IntegrationTesting} +Due to the unique nature of our software (i.e. there is no reference implementation to compare results with), we had to come up with files to perform integration testing. We have concatenated all the source files from the \texttt{kernel/} directory in the Linux kernel, versions \texttt{2.6.27.62} and \texttt{3.2.13}, and the entire kernel version \texttt{0.1} and run \texttt{diffr} on all possible pairs (\texttt{original file}, \texttt{new file}) and \texttt{patchr} on the \texttt{original file} and patch file generated using \texttt{diffr} to see if we can get back the \texttt{new file}. + +This approach allows us to test the integration between the two tools quite thoroughly and definitely, as we test the entire loop of generating the patch file and applying it to the original file. The integration tests give us great confidence in the correctness of our implementation. + +\paragraph{Performance Testing} +We used \texttt{caliper}~\cite{caliper}, Google's open-source framework for writing, running and viewing the results of Java Microbenchmarks.
It allowed us to fine-tune the Suffix Tree implementation to guarantee high performance. diff --git a/util/pom.xml b/util/pom.xml deleted file mode 100644 index 480a01a..0000000 --- a/util/pom.xml +++ /dev/null @@ -1,74 +0,0 @@ - - - 4.0.0 - - - diffr - parent - 1.0-SNAPSHOT - - - diffr - util - 1.0-SNAPSHOT - jar - ${project.groupId}.${project.artifactId} - Utilities for diffr. - - - - com.google.guava - guava - ${guava.version} - - - javolution - javolution - ${javolution.version} - - - org.testng - testng - ${testng.version} - test - - - org.mockito - mockito-all - ${mockito.version} - test - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - test - - - - - - - org.apache.maven.plugins - maven-jar-plugin - ${jar.version} - - - jar-test-classes - package - - test-jar - - - - - - **/*Test.* - - - - - - diff --git a/util/src/main/java/diffr/util/ArgumentsProcessor.java b/util/src/main/java/diffr/util/ArgumentsProcessor.java deleted file mode 100644 index 08b55e0..0000000 --- a/util/src/main/java/diffr/util/ArgumentsProcessor.java +++ /dev/null @@ -1,45 +0,0 @@ -package diffr.util; - -import com.google.common.base.Optional; - -/** - * Processes command line arguments. - * - * @author William Martin - * @since 1.0 - */ -public class ArgumentsProcessor { - - /** - * Checks if the given arguments contain a call for help. - * - * @param args the arguments to check for help. - * @return true if the given arguments contain a call for help. - */ - public static boolean containsHelpArgument(final String... args) { - for (final String s : args) { - if (s.equalsIgnoreCase("--help") - || s.equalsIgnoreCase("-help")) { - return true; - } - } - return false; - } - - /** - * Extract the output file location from the given arguments. - * - * @param args the arguments to extract the output file location from. - * @returnthe output file location from the given arguments. - */ - public static Optional extractOutputFile(final String... 
args) { - String result = null; - for (int i = 0; i < args.length - 1; i++) { - final String s = args[i]; - if (s.equalsIgnoreCase("-o")) { - result = args[i + 1]; - } - } - return Optional.fromNullable(result); - } -} diff --git a/util/src/main/java/diffr/util/ListIterators.java b/util/src/main/java/diffr/util/ListIterators.java deleted file mode 100644 index 1026cdb..0000000 --- a/util/src/main/java/diffr/util/ListIterators.java +++ /dev/null @@ -1,90 +0,0 @@ -package diffr.util; - -import java.util.ListIterator; -import java.util.NoSuchElementException; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * Utility methods for creating various {@link ListIterators}. - * - * @author Jakub D Kozlowski - * @since 0.1 - */ -public final class ListIterators { - - /** - * Creates an {@code listIterator} returning the first {@code limitSize} elements of the - * given {@code listIterator}. If the original {@code listIterator} does not contain that many - * elements, the returned {@code listIterator} will have the same behavior as the original - * {@code listIterator}. The returned {@code listIterator} supports {@link ListIterator#remove()}, - * {@link ListIterator#set(Object)}, {@link ListIterator#remove()} and {@link ListIterator#add(Object)} - * if the original {@code listIterator} does. Calling {@link ListIterator#previous()} decreases internal count of - * elements returned. - * - * @param listIterator the listIterator to limit. - * @param limitSize the maximum number of elements in the returned listIterator. - * - * @throws NullPointerException if {@code listIterator} is null. - * @throws IllegalArgumentException if {@code limitSize} is negative. 
- */ - public static ListIterator limit(final ListIterator listIterator, final int limitSize) { - checkNotNull(listIterator); - checkArgument(limitSize >= 0); - return new ListIterator() { - - private int count; - - @Override - public boolean hasNext() { - return count < limitSize && listIterator.hasNext(); - } - - @Override - public E next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - count++; - return listIterator.next(); - } - - @Override - public boolean hasPrevious() { - return listIterator.hasPrevious(); - } - - @Override - public E previous() { - count--; - return listIterator.previous(); - } - - @Override - public int nextIndex() { - return listIterator.nextIndex(); - } - - @Override - public int previousIndex() { - return listIterator.previousIndex(); - } - - @Override - public void remove() { - listIterator.remove(); - } - - @Override - public void set(E e) { - listIterator.set(e); - } - - @Override - public void add(E e) { - listIterator.add(e); - } - }; - } -} diff --git a/util/src/main/java/diffr/util/instruction/CopyInstruction.java b/util/src/main/java/diffr/util/instruction/CopyInstruction.java deleted file mode 100644 index dfd13df..0000000 --- a/util/src/main/java/diffr/util/instruction/CopyInstruction.java +++ /dev/null @@ -1,49 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.collect.Range; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * {@link Instruction} for copying a set of lines from the original file. - *

Example output:
- * 7,10
- * 15,18

- * - * @author William Martin - * @since 0.1 - */ -public class CopyInstruction implements Instruction { - - private final Range range; - - /** - * Default constructor. - * - * @param range the range of this copy. - */ - public CopyInstruction(final Range range) { - checkNotNull(range); - checkArgument(range.hasLowerBound()); - checkArgument(range.lowerEndpoint() >= 0); - this.range = range; - } - - /** - * Gets the range. - * - * @return the range. - */ - public Range getRange() { - return range; - } - - /** - * {@inheritDoc} - */ - @Override - public Type getType() { - return Type.Copy; - } -} diff --git a/util/src/main/java/diffr/util/instruction/IllegalPatchInstructionException.java b/util/src/main/java/diffr/util/instruction/IllegalPatchInstructionException.java deleted file mode 100644 index 0b0c58b..0000000 --- a/util/src/main/java/diffr/util/instruction/IllegalPatchInstructionException.java +++ /dev/null @@ -1,20 +0,0 @@ -package diffr.util.instruction; - -/** - * Exception that announces an Illegal Instruction. - * - * @author William Martin - * @since 1.0 - */ -public class IllegalPatchInstructionException extends Exception { - - public static final String MESSAGE = "Error. Illegal instruction."; - - public IllegalPatchInstructionException(final String text) { - super(text); - } - - public IllegalPatchInstructionException() { - super(MESSAGE); - } -} diff --git a/util/src/main/java/diffr/util/instruction/InsertInstruction.java b/util/src/main/java/diffr/util/instruction/InsertInstruction.java deleted file mode 100644 index 3735281..0000000 --- a/util/src/main/java/diffr/util/instruction/InsertInstruction.java +++ /dev/null @@ -1,41 +0,0 @@ -package diffr.util.instruction; - -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * {@link Instruction} for inserting some text. 
- * - * @author William Martin - * @since 0.1 - */ -public class InsertInstruction implements Instruction { - - private final String text; - - /** - * Default constructor. - * - * @param text the text to insert. - */ - public InsertInstruction(final String text) { - checkNotNull(text); - this.text = text; - } - - /** - * Gets the text. - * - * @return the text. - */ - public String getText() { - return text; - } - - /** - * {@inheritDoc} - */ - @Override - public Type getType() { - return Type.Insert; - } -} diff --git a/util/src/main/java/diffr/util/instruction/Instruction.java b/util/src/main/java/diffr/util/instruction/Instruction.java deleted file mode 100644 index 848e473..0000000 --- a/util/src/main/java/diffr/util/instruction/Instruction.java +++ /dev/null @@ -1,24 +0,0 @@ -package diffr.util.instruction; - -/** - * Class for an instruction for creating the new file. - * - * @author William Martin - * @since 0.1 - */ -public interface Instruction { - - /** - * Enumerates the possible Instruction types. - */ - public enum Type { - Copy, Insert; - } - - /** - * Gets the type of this Instruction. - * - * @return the type of this Instruction. - */ - public Type getType(); -} diff --git a/util/src/main/java/diffr/util/instruction/InstructionComposer.java b/util/src/main/java/diffr/util/instruction/InstructionComposer.java deleted file mode 100644 index 7597e19..0000000 --- a/util/src/main/java/diffr/util/instruction/InstructionComposer.java +++ /dev/null @@ -1,37 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.collect.Range; - -/** - * Composes a string representation of an Instruction. - * - * @author William Martin - * @since 0.2 - */ -public class InstructionComposer { - - public static final String COMMA = ",", INSERT = "> "; - - /** - * Composes a string representation of the given Instruction. - * - * @param instruction the Instruction to convert. - * @return the Instruction as a string. 
- */ - public static String composeString(final Instruction instruction) { - final StringBuilder stringBuilder = new StringBuilder(); - switch (instruction.getType()) { - case Copy: - final Range range = ((CopyInstruction) instruction).getRange(); - stringBuilder.append(range.lowerEndpoint()); - stringBuilder.append(COMMA); - stringBuilder.append(range.upperEndpoint()); - break; - case Insert: - final String text = ((InsertInstruction) instruction).getText(); - stringBuilder.append(INSERT); - stringBuilder.append(text); - } - return stringBuilder.toString(); - } -} diff --git a/util/src/main/java/diffr/util/instruction/InstructionParser.java b/util/src/main/java/diffr/util/instruction/InstructionParser.java deleted file mode 100644 index 80bd5a2..0000000 --- a/util/src/main/java/diffr/util/instruction/InstructionParser.java +++ /dev/null @@ -1,61 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; -import com.google.common.collect.Range; -import com.google.common.collect.Ranges; - -/** - * Parses {@link Instruction}s. - * - * @author William Martin - * @since 0.2 - */ -public class InstructionParser { - - public static final String NUMBER_REGEX = "(0|([1-9]\\d*))", - COPY_REGEX = NUMBER_REGEX + "," + NUMBER_REGEX, - INSERT_REGEX = "> [\\S\\s]*"; - - /** - * Parses the given string into a CopyInstruction if it matches this Instruction's regex. - * - * @param text the text to parse. - * @return the CopyInstruction. - */ - @VisibleForTesting - static CopyInstruction parseCopyInstruction(final String text) { - final String[] segments = text.split(InstructionComposer.COMMA); - final int fromID = Integer.parseInt(segments[0]); - final int toID = Integer.parseInt(segments[1]); - final Range range = Ranges.closed(fromID, toID); - return new CopyInstruction(range); - } - - /** - * Text constructor. Parses the given string into a CopyInstruction if it matches this Instruction's regex. 
- * - * @param text the text to parse. - * @return the InsertInstruction. - */ - @VisibleForTesting - static InsertInstruction parseInsertInstruction(final String text) { - return new InsertInstruction(text.substring(2)); - } - - /** - * Parses the given text for an Instruction. - * - * @param text the text to parse for an Instruction. - * @return the Instruction contained in the text, or null if no Instruction is found. - */ - public static Optional parseInstruction(final String text) { - Instruction instruction = null; - if (text.matches(COPY_REGEX)) { - instruction = parseCopyInstruction(text); - } else if (text.matches(INSERT_REGEX)) { - instruction = parseInsertInstruction(text); - } - return Optional.fromNullable(instruction); - } -} diff --git a/util/src/main/java/diffr/util/instruction/Instructions.java b/util/src/main/java/diffr/util/instruction/Instructions.java deleted file mode 100644 index 41a3164..0000000 --- a/util/src/main/java/diffr/util/instruction/Instructions.java +++ /dev/null @@ -1,73 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.base.Optional; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -/** - * Factory methods for Instruction classes. - * - * @author William Martin - * @since 0.2 - */ -public class Instructions { - - /** - * Writes the given Instruction to the supplied writer and ends the line. - * - * @param instruction the Instruction to write. - * @param writer writer to write to. - * @throws java.io.IOException if there was an error when writing. 
- * @see {@link java.io.BufferedWriter#write(String)} - */ - public static void writeInstruction(final Instruction instruction, - final BufferedWriter writer) throws IOException { - final String string = InstructionComposer.composeString(instruction); - writer.write(string); - writer.write("\n"); - } - - /** - * Reads the next {@link diffr.util.instruction.Instruction} in the input. - * - * @param reader input to read from. - * @return the next {@link diffr.util.instruction.Instruction} in the input, - * or null if the end of the input is reached. - * @throws IOException if an error reading the input occurred. - */ - public static Optional readInstruction(final BufferedReader reader) throws IOException { - final String line = reader.readLine(); - if (null != line) { - try { - return InstructionParser.parseInstruction(line); - } catch (final IllegalArgumentException iae) { - } - } - return Optional.absent(); - } - - /** - * Reads the given list of Strings into a list of Instructions. - * - * @param patch the list of Strings. - * @return a list of Instructions. - * @throws - */ - public static List readInstructions(final List patch) - throws IllegalPatchInstructionException { - final List instructions = new ArrayList(patch.size()); - for (final String line : patch) { - final Optional instruction = InstructionParser.parseInstruction(line); - if (instruction.isPresent()) { - instructions.add(instruction.get()); - } else { - throw new IllegalPatchInstructionException("Error. 
Illegal patch Instruction: " + line); - } - } - return instructions; - } -} diff --git a/util/src/test/java/diffr/util/ArgumentsProcessorTest.java b/util/src/test/java/diffr/util/ArgumentsProcessorTest.java deleted file mode 100644 index e6a902d..0000000 --- a/util/src/test/java/diffr/util/ArgumentsProcessorTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package diffr.util; - -import com.google.common.collect.Lists; -import org.testng.annotations.Test; - -import java.util.List; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link diffr.util.ArgumentsProcessor}. - * - * @author William Martin - * @since 1.0 - */ -public class ArgumentsProcessorTest { - - /** - * Tests whether the contains help argument method works correctly. - */ - @Test - public void testContainsHelpArgument() { - final List valid = Lists.newArrayList("--hElp", "me", "please"); - final List inValid = Lists.newArrayList("hl-p", "me", "please"); - assertThat(ArgumentsProcessor.containsHelpArgument(valid.toArray(new String[0])), is(true)); - assertThat(ArgumentsProcessor.containsHelpArgument(inValid.toArray(new String[0])), is(false)); - } - - /** - * Tests whether the extract output file method works correctly. 
- */ - @Test - public void testExtractOutputFile() { - final List valid = Lists.newArrayList("-hElp!!----", "-O", "please"); - final List inValid = Lists.newArrayList("-ohlp", "me", "-o"); - assertThat(ArgumentsProcessor.extractOutputFile(valid.toArray(new String[0])).isPresent(), is(true)); - assertThat(ArgumentsProcessor.extractOutputFile(inValid.toArray(new String[0])).isPresent(), is(false)); - } -} diff --git a/util/src/test/java/diffr/util/ListIteratorsTest.java b/util/src/test/java/diffr/util/ListIteratorsTest.java deleted file mode 100644 index 1c565dd..0000000 --- a/util/src/test/java/diffr/util/ListIteratorsTest.java +++ /dev/null @@ -1,47 +0,0 @@ -package diffr.util; - -import com.google.common.collect.Lists; -import org.testng.annotations.Test; - -import java.util.List; -import java.util.ListIterator; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; -import static org.mockito.Mockito.mock; - -/** - * Tests {@link ListIteratorsTest}. 
- * - * @author Jakub D Kozlowski - * @since 0.2 - */ -public class ListIteratorsTest { - - @Test(expectedExceptions = NullPointerException.class) - public void testLimitNullListIterator() { - ListIterators.limit(null, 0); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testLimitNegativeLimitSize() { - ListIterators.limit(mock(ListIterator.class), -1); - } - - @Test - public void testLimitWithoutPrevious() { - final List fullList = Lists.newArrayList(1, 2, 3, 4, 5); - final ListIterator subListIterator = fullList.listIterator(1); - final ListIterator limitSubListIterator = ListIterators.limit(fullList.listIterator(1), 3); - - for (int i = 0; i < 3; i++) { - final Integer subListElement = subListIterator.next(); - final Integer limitSubListElement = limitSubListIterator.next(); - assertThat(subListIterator.previousIndex(), is(limitSubListIterator.previousIndex())); - assertThat(subListIterator.nextIndex(), is(limitSubListIterator.nextIndex())); - assertThat(subListElement, is(limitSubListElement)); - } - - assertThat(limitSubListIterator.hasNext(), is(false)); - } -} diff --git a/util/src/test/java/diffr/util/RandomFiles.java b/util/src/test/java/diffr/util/RandomFiles.java deleted file mode 100644 index 56b1cf6..0000000 --- a/util/src/test/java/diffr/util/RandomFiles.java +++ /dev/null @@ -1,53 +0,0 @@ -package diffr.util; - -import com.google.common.collect.Lists; -import javolution.text.Text; - -import java.util.List; -import java.util.Random; - -/** - * Utilities for generating random files. - * - * @author Jakub D Kozlowski - * @since 0.3 - */ -public final class RandomFiles { - - /** - * Gets a randomly generated file of this {@code fileLength} using this {@code seed} to initialise the random - * number generator. - * - * @param fileLength fileLength of the file to generate. - * @param seed initial seed. - * - * @return random file of this {@code fileLength}, generated from this {@code seed}. 
- */ - public static List getRandomFile(final long fileLength, final long seed) { - - final List testFile = Lists.newArrayList(); - - final Random random = new Random(seed); - - final char minChar = 33; - final int charRange = 127 - 33 + 1; - - for (int i = 0; i < fileLength; i++) { - - if (0 == random.nextInt(4) || 0 == testFile.size()) { - - final StringBuilder b = new StringBuilder(); - final int lineLength = random.nextInt(100) + 1; - for (int j = 0; j < lineLength; j++) { - b.append((char) (minChar + (random.nextInt(charRange)))); - } - testFile.add(new Text(b.toString())); - } - else { - testFile.add(testFile.get(random.nextInt(testFile.size()))); - } - } - - return testFile; - } -} diff --git a/util/src/test/java/diffr/util/instruction/CopyInstructionTest.java b/util/src/test/java/diffr/util/instruction/CopyInstructionTest.java deleted file mode 100644 index 9a51acb..0000000 --- a/util/src/test/java/diffr/util/instruction/CopyInstructionTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.collect.Range; -import com.google.common.collect.Ranges; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link CopyInstruction} - * - * @author William Martin - * @since 0.1 - */ -public class CopyInstructionTest { - - private int fromID, toID; - private Range range; - private CopyInstruction copyInstruction; - - @BeforeMethod - public void setUp() { - fromID = 5; - toID = 10; - range = Ranges.closed(fromID, toID); - copyInstruction = new CopyInstruction(range); - } - - /** - * Tests whether the constructor works correctly with negative from ID. 
- */ - @Test(expectedExceptions = IllegalArgumentException.class) - public void testConstructorNegativeFromID() { - new CopyInstruction(Ranges.closed(-1, toID)); - } - - /** - * Tests whether the constructor works correctly with illegal range. - */ - @Test(expectedExceptions = IllegalArgumentException.class) - public void testSetFromIDExceptionRange() { - new CopyInstruction(Ranges.closed(toID + 1, toID)); - } - - /** - * Tests whether the range getter works correctly. - */ - @Test - public void testRange() { - assertThat(copyInstruction.getRange(), is(range)); - } -} diff --git a/util/src/test/java/diffr/util/instruction/IllegalPatchInstructionExceptionTest.java b/util/src/test/java/diffr/util/instruction/IllegalPatchInstructionExceptionTest.java deleted file mode 100644 index 70ff5bc..0000000 --- a/util/src/test/java/diffr/util/instruction/IllegalPatchInstructionExceptionTest.java +++ /dev/null @@ -1,32 +0,0 @@ -package diffr.util.instruction; - -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link IllegalPatchInstructionException}. - * - * @author William Martin - * @since 1.0 - */ -public class IllegalPatchInstructionExceptionTest { - - /** - * Tests the default message. - */ - @Test - public void testDefaultMessage() { - assertThat(new IllegalPatchInstructionException().getMessage(), is(IllegalPatchInstructionException.MESSAGE)); - } - - /** - * Tests a custom message. 
- */ - @Test - public void testMessage() { - final String testMessage = "test message"; - assertThat(new IllegalPatchInstructionException(testMessage).getMessage(), is(testMessage)); - } -} diff --git a/util/src/test/java/diffr/util/instruction/InsertInstructionTest.java b/util/src/test/java/diffr/util/instruction/InsertInstructionTest.java deleted file mode 100644 index d1b0a83..0000000 --- a/util/src/test/java/diffr/util/instruction/InsertInstructionTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package diffr.util.instruction; - -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link InsertInstruction} - * - * @author William Martin - * @since 0.1 - */ -public class InsertInstructionTest { - - private String text; - private InsertInstruction insertInstruction; - - @BeforeMethod - public void setUp() { - text = "hello world"; - insertInstruction = new InsertInstruction(text); - } - - /** - * Tests whether an exception is thrown when the constructor is invoked with null text. - */ - @Test(expectedExceptions = NullPointerException.class) - public void testNullConstructorArgument() { - new InsertInstruction(null); - } - - /** - * Tests whether the text getter works correctly. 
- */ - @Test - public void testGetText() { - assertThat(text, is(insertInstruction.getText())); - } -} diff --git a/util/src/test/java/diffr/util/instruction/InstructionComposerTest.java b/util/src/test/java/diffr/util/instruction/InstructionComposerTest.java deleted file mode 100644 index 7901331..0000000 --- a/util/src/test/java/diffr/util/instruction/InstructionComposerTest.java +++ /dev/null @@ -1,33 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.collect.Ranges; -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link diffr.util.instruction.InstructionComposer} - * - * @author William Martin - * @since 0.2 - */ -public class InstructionComposerTest { - - Instruction instruction; - - /** - * Tests whether the compose string method works correctly. - */ - @Test - public void testStringComposer() { - final String text = "hello world"; - instruction = new InsertInstruction(text); - final String converted = InstructionComposer.composeString(instruction); - assertThat(converted.matches(InstructionParser.INSERT_REGEX), is(true)); - - instruction = new CopyInstruction(Ranges.closed(5, 10)); - final String formattedString = InstructionComposer.composeString(instruction); - assertThat(formattedString.matches(InstructionParser.COPY_REGEX), is(true)); - } -} diff --git a/util/src/test/java/diffr/util/instruction/InstructionParserTest.java b/util/src/test/java/diffr/util/instruction/InstructionParserTest.java deleted file mode 100644 index 5a33e02..0000000 --- a/util/src/test/java/diffr/util/instruction/InstructionParserTest.java +++ /dev/null @@ -1,97 +0,0 @@ -package diffr.util.instruction; - -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link diffr.util.instruction.InstructionParser} - * - * @author 
William Martin - * @since 0.2 - */ -public class InstructionParserTest { - - private InstructionParser instructionParser; - public static final String[] trueInsertExamples = {"> hello", - "> 1245oweifhwoeijoij//fwefwgtwgt", "> \n\n\n\n"}; - public static final String[] falseInsertExamples = {">hello", ">\t\n", ">\n", "><", - ">>", ">", " >", "hello", "123\n", "\t> hello"}; - public static final String[] trueCopyExamples = {"5,41", "50,425", "234565,2344000", "0,0"}; - public static final String[] falseCopyExamples = {"05,4", "5f,4", ",0", ",", - "0,", "hello sir!", " 3,4", "4,2 ", "4, 2", "4 ,2"}; - - @BeforeMethod - public void setUp() { - instructionParser = new InstructionParser(); - } - - /** - * Tests whether the parse copy instruction method works correctly. - */ - @Test - public void testParseCopyInstruction() { - for (final String s : trueCopyExamples) { - final CopyInstruction copyInstruction = - instructionParser.parseCopyInstruction(s); - assertThat(InstructionComposer.composeString(copyInstruction), is(s)); - } - } - - /** - * Tests whether the parse insert instruction method works correctly. - */ - @Test - public void testParseInsertInstruction() { - for (final String s : trueInsertExamples) { - final InsertInstruction insertInstruction = - instructionParser.parseInsertInstruction(s); - assertThat(InstructionComposer.composeString(insertInstruction), is(s)); - } - } - - /** - * Tests whether the parse instruction method works correctly. 
- */ - @Test - public void testParseInstruction() { - for (final String s : trueInsertExamples) { - final Instruction instruction = - instructionParser.parseInstruction(s).get(); - assertThat(InstructionComposer.composeString(instruction), is(s)); - } - for (final String s : trueCopyExamples) { - final Instruction instruction = - instructionParser.parseInstruction(s).get(); - assertThat(InstructionComposer.composeString(instruction), is(s)); - } - } - - /** - * Tests whether the static regex for InsertInstruction works correctly. - */ - @Test - public void testInsertRegex() { - for (final String s : trueInsertExamples) { - assertThat(s.matches(InstructionParser.INSERT_REGEX), is(true)); - } - for (final String s : falseInsertExamples) { - assertThat(s.matches(InstructionParser.INSERT_REGEX), is(false)); - } - } - - /** - * Tests whether the static regex for CopyInstruction works correctly. - */ - @Test - public void testCopyRegex() { - for (final String s : trueCopyExamples) { - assertThat(s.matches(InstructionParser.COPY_REGEX), is(true)); - } - for (final String s : falseCopyExamples) { - assertThat(s.matches(InstructionParser.COPY_REGEX), is(false)); - } - } -} diff --git a/util/src/test/java/diffr/util/instruction/InstructionsTest.java b/util/src/test/java/diffr/util/instruction/InstructionsTest.java deleted file mode 100644 index 96f13ee..0000000 --- a/util/src/test/java/diffr/util/instruction/InstructionsTest.java +++ /dev/null @@ -1,105 +0,0 @@ -package diffr.util.instruction; - -import com.google.common.base.Optional; -import com.google.common.collect.Lists; -import org.testng.annotations.Test; - -import java.io.*; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.is; - -/** - * Tests {@link InstructionParser} - * - * @author William Martin - * @since 0.2 - */ -public class InstructionsTest { - - 
BufferedReader reader; - - /** - * Tests whether the read instruction method works correctly. - */ - @Test - public void testReadInstruction() throws IOException { - final String text = "3,4\n> hello world\n1,2\nbroken line!\n>anotherbrokenline\n\n"; - reader = new BufferedReader(new StringReader(text)); - - assertThat(Instructions.readInstruction(reader).get(), - is(CopyInstruction.class)); - assertThat(Instructions.readInstruction(reader).get(), - is(InsertInstruction.class)); - assertThat(Instructions.readInstruction(reader).get(), - is(CopyInstruction.class)); - assertThat(Instructions.readInstruction(reader).isPresent(), - is(false)); - assertThat(Instructions.readInstruction(reader).isPresent(), - is(false)); - assertThat(Instructions.readInstruction(reader).isPresent(), - is(false)); - assertThat(Instructions.readInstruction(reader).isPresent(), - is(false)); - } - - /** - * Tests whether the parse copy instruction method works correctly. - */ - @Test - public void testWriteInstruction() throws IOException { - final List instructionList = new ArrayList(); - final InstructionParser parser = new InstructionParser(); - for (final String s : InstructionParserTest.trueCopyExamples) { - instructionList.add(parser.parseInstruction(s).get()); - } - for (final String s : InstructionParserTest.trueInsertExamples) { - instructionList.add(parser.parseInstruction(s).get()); - } - Collections.shuffle(instructionList); - final StringWriter stringWriter = new StringWriter(); - final BufferedWriter writer = new BufferedWriter(stringWriter); - - for (final Instruction instruction : instructionList) { - Instructions.writeInstruction(instruction, writer); - } - final String result = stringWriter.toString(); - reader = new BufferedReader(new StringReader(result)); - Optional instruction = null; - final List resultList = new ArrayList(); - while ((instruction = Instructions.readInstruction(reader)).isPresent()) { - resultList.add(instruction.get()); - } - for ( - Iterator it1 = 
instructionList.iterator(), - it2 = resultList.iterator(); - it1.hasNext() && it2.hasNext(); - ) { - assertThat(it1.next() == it2.next(), is(true)); - } - } - - /** - * Tests whether the read instructions method works correctly. - */ - @Test - public void testReadInstructions() throws IOException, IllegalPatchInstructionException { - final List lines = Lists.newArrayList("3,4", "> hello world", "1,2", "11,12", "> okline"); - final List instructions = Instructions.readInstructions(lines); - assertThat(instructions.get(0), is(CopyInstruction.class)); - assertThat(instructions.get(1), is(InsertInstruction.class)); - assertThat(instructions.get(2), is(CopyInstruction.class)); - assertThat(instructions.get(3), is(CopyInstruction.class)); - assertThat(instructions.get(4), is(InsertInstruction.class)); - } - - @Test(expectedExceptions = IllegalPatchInstructionException.class) - public void testReadInstructionsException() throws IllegalPatchInstructionException { - final List broken = Lists.newArrayList("brokeninstruction"); - Instructions.readInstructions(broken); - } -}